From 18d4ecdd9cb59e21ebfc7561bee7215b1e822459 Mon Sep 17 00:00:00 2001 From: Jakub Binkowski Date: Mon, 3 Jun 2024 16:32:39 +0200 Subject: [PATCH] Finish sft experiments (#21) * Added notebook for inspecting SFT results * Add llama-unsloth to zero-shot eval * Reproduce prediction on 1500 step of Unsloth-Mistral * Add Bielik LLM --- configs/model/Bielik-7B-Instruct-v0.1.yaml | 8 + ...h-Mistral-7B-Instruct-v0.3-fine-tuned.yaml | 10 + .../.gitignore | 1 + .../predict/pl-court-instruct/.gitignore | 3 + .../metrics_Bielik-7B-Instruct-v0.1.json | 12 + .../metrics_Unsloth-Llama-3-8B-Instruct.json | 12 + ...h-Mistral-7B-Instruct-v0.3-fine-tuned.json | 12 + .../pl-court-instruct/metrics_summary.md | 16 +- dvc.lock | 141 +++++++- dvc.yaml | 9 +- nbs/Data/02_Analyse_sft.ipynb | 312 ++++++++++++++++++ 11 files changed, 524 insertions(+), 12 deletions(-) create mode 100644 configs/model/Bielik-7B-Instruct-v0.1.yaml create mode 100644 configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml create mode 100644 data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/.gitignore create mode 100644 data/experiments/predict/pl-court-instruct/metrics_Bielik-7B-Instruct-v0.1.json create mode 100644 data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json create mode 100644 data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json create mode 100644 nbs/Data/02_Analyse_sft.ipynb diff --git a/configs/model/Bielik-7B-Instruct-v0.1.yaml b/configs/model/Bielik-7B-Instruct-v0.1.yaml new file mode 100644 index 0000000..aa7a872 --- /dev/null +++ b/configs/model/Bielik-7B-Instruct-v0.1.yaml @@ -0,0 +1,8 @@ +name: speakleash/Bielik-7B-Instruct-v0.1 +tokenizer_name: ${.name} + +adapter_path: null + +max_seq_length: 4_000 +padding: longest +batch_size: 1 diff --git a/configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml b/configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml new file mode 100644 index 0000000..91c2c7a --- /dev/null +++ b/configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml @@ -0,0 +1,10 @@ +name: unsloth/mistral-7b-instruct-v0.3-bnb-4bit +tokenizer_name: ${.name} + +adapter_path: data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/pl-court-instruct/checkpoint-1500 + +max_seq_length: 20_000 +padding: longest +batch_size: 1 + +use_unsloth: true diff --git a/data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/.gitignore b/data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/.gitignore new file mode 100644 index 0000000..c5110ed --- /dev/null +++ b/data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/.gitignore @@ -0,0 +1 @@ +/pl-court-instruct diff --git a/data/experiments/predict/pl-court-instruct/.gitignore b/data/experiments/predict/pl-court-instruct/.gitignore index f2c04e6..283c20e 100644 --- a/data/experiments/predict/pl-court-instruct/.gitignore +++ b/data/experiments/predict/pl-court-instruct/.gitignore @@ -6,3 +6,6 @@ /outputs_Mistral-7B-Instruct-v0.2-fine-tuned.json /outputs_Unsloth-Llama-3-8B-Instruct-fine-tuned.json /outputs_Unsloth-Mistral-7B-Instruct-v0.3.json +/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json +/outputs_Unsloth-Llama-3-8B-Instruct.json +/outputs_Bielik-7B-Instruct-v0.1.json diff --git a/data/experiments/predict/pl-court-instruct/metrics_Bielik-7B-Instruct-v0.1.json b/data/experiments/predict/pl-court-instruct/metrics_Bielik-7B-Instruct-v0.1.json new file mode 100644 index 0000000..e6d130c --- /dev/null +++ 
b/data/experiments/predict/pl-court-instruct/metrics_Bielik-7B-Instruct-v0.1.json @@ -0,0 +1,12 @@ +{ + "full_text_chrf": 0.2468319535255432, + "field_chrf": { + "court_name": 0.7368742823600769, + "date": 0.7829525470733643, + "department_name": 0.626532793045044, + "judges": 0.30981674790382385, + "legal_bases": 0.3045749366283417, + "recorder": 0.5168337821960449, + "signature": 0.4849330484867096 + } +} \ No newline at end of file diff --git a/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json new file mode 100644 index 0000000..0e8eb94 --- /dev/null +++ b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json @@ -0,0 +1,12 @@ +{ + "full_text_chrf": 0.4385761320590973, + "field_chrf": { + "court_name": 0.8789530396461487, + "date": 0.9822721481323242, + "department_name": 0.9057374000549316, + "judges": 0.9149863123893738, + "legal_bases": 0.42645466327667236, + "recorder": 0.7640316486358643, + "signature": 0.7549777626991272 + } +} \ No newline at end of file diff --git a/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json new file mode 100644 index 0000000..e0a55ee --- /dev/null +++ b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json @@ -0,0 +1,12 @@ +{ + "full_text_chrf": 0.8193286061286926, + "field_chrf": { + "court_name": 0.9964265823364258, + "date": 0.9885857701301575, + "department_name": 0.9962303042411804, + "judges": 0.981475830078125, + "legal_bases": 0.7374544143676758, + "recorder": 0.9933416843414307, + "signature": 0.9780842661857605 + } +} \ No newline at end of file diff --git a/data/experiments/predict/pl-court-instruct/metrics_summary.md b/data/experiments/predict/pl-court-instruct/metrics_summary.md index 497a5be..b6e1f18 100644 --- a/data/experiments/predict/pl-court-instruct/metrics_summary.md +++ b/data/experiments/predict/pl-court-instruct/metrics_summary.md @@ -1,7 +1,9 @@ -| llm | full_text_chrf | court_name | date | department_name | judges | legal_bases | recorder | signature | -|:---------------------------------------|-----------------:|-------------:|-------:|------------------:|---------:|--------------:|-----------:|------------:| -| Meta-Llama-3-8B-Instruct | 0.247 | 0.862 | 0.971 | 0.833 | 0.882 | 0.287 | 0.805 | 0.778 | -| Mistral-7B-Instruct-v0.2 | 0.432 | 0.839 | 0.922 | 0.850 | 0.879 | 0.333 | 0.837 | 0.145 | -| Mistral-7B-Instruct-v0.2-fine-tuned | 0.772 | 0.987 | 0.990 | 0.965 | 0.952 | 0.600 | 0.979 | 0.972 | -| Unsloth-Llama-3-8B-Instruct-fine-tuned | 0.828 | 0.995 | 0.989 | 0.986 | 0.977 | 0.601 | 0.993 | 0.994 | -| Unsloth-Mistral-7B-Instruct-v0.3 | 0.477 | 0.830 | 0.987 | 0.900 | 0.870 | 0.419 | 0.943 | 0.567 | \ No newline at end of file +| llm | full_text_chrf | court_name | date | department_name | judges | legal_bases | recorder | signature | +|:--------------------------------------------|-----------------:|-------------:|-------:|------------------:|---------:|--------------:|-----------:|------------:| +| Meta-Llama-3-8B-Instruct | 0.247 | 0.862 | 0.971 | 0.833 | 0.882 | 0.287 | 0.805 | 0.778 | +| Mistral-7B-Instruct-v0.2 | 0.432 | 0.839 | 0.922 | 0.850 | 0.879 | 0.333 | 0.837 | 0.145 | +| Mistral-7B-Instruct-v0.2-fine-tuned | 0.772 | 0.987 | 0.990 | 0.965 | 0.952 | 0.600 | 0.979 | 0.972 | 
+| Unsloth-Llama-3-8B-Instruct | 0.439 | 0.879 | 0.982 | 0.906 | 0.915 | 0.426 | 0.764 | 0.755 | +| Unsloth-Llama-3-8B-Instruct-fine-tuned | 0.828 | 0.995 | 0.989 | 0.986 | 0.977 | 0.601 | 0.993 | 0.994 | +| Unsloth-Mistral-7B-Instruct-v0.3 | 0.477 | 0.830 | 0.987 | 0.900 | 0.870 | 0.419 | 0.943 | 0.567 | +| Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned | 0.819 | 0.996 | 0.989 | 0.996 | 0.981 | 0.737 | 0.993 | 0.978 | \ No newline at end of file diff --git a/dvc.lock b/dvc.lock index 848d8a3..1587886 100644 --- a/dvc.lock +++ b/dvc.lock @@ -320,8 +320,8 @@ stages: outs: - path: data/experiments/predict/pl-court-instruct/metrics_summary.md hash: md5 - md5: a72452f53099f61de9d653af1a596a3a - size: 1119 + md5: 80c3922982cb8a41468063481dbf695c + size: 1484 evaluate@Unsloth-Mistral-7B-Instruct-v0.3: cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3.json @@ -341,3 +341,140 @@ stages: hash: md5 md5: 091b8888275600052dd2dcdd36a55588 size: 305 + predict@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned: + cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned + deps: + - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml + hash: md5 + md5: 8e8b380ef9bc65715cb833ce104cda20 + size: 256 + - path: configs/predict.yaml + hash: md5 + md5: e6b047cf62e612a32381d6221eb99b4e + size: 416 + - path: scripts/sft/predict.py + hash: md5 + md5: 69e4844a715c9c5c75e1127a06472ad4 + size: 3148 + outs: + - path: + data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json + hash: md5 + md5: a4fda5774b367e8924cf07f3bf271922 + size: 1834778 + evaluate@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned: + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file + data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json + deps: + - path: + data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json + hash: md5 + md5: a4fda5774b367e8924cf07f3bf271922 + size: 1834778 + - path: scripts/sft/evaluate.py + hash: md5 + md5: 5ee442a9a3525af7596bf24c3d724a1d + size: 570 + outs: + - path: + data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json + hash: md5 + md5: 3b3589929112cb2f199044d240e87bcc + size: 305 + predict@Unsloth-Llama-3-8B-Instruct: + cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Llama-3-8B-Instruct + deps: + - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml + hash: md5 + md5: e97bb2e6bf39f75edea7714d6ba58b77 + size: 160 + - path: configs/predict.yaml + hash: md5 + md5: e6b047cf62e612a32381d6221eb99b4e + size: 416 + - path: scripts/sft/predict.py + hash: md5 + md5: 69e4844a715c9c5c75e1127a06472ad4 + size: 3148 + outs: + - path: + data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json + hash: md5 + md5: df2f1d464152f87737c8ebb5b0673854 + size: 2179383 + evaluate@Unsloth-Llama-3-8B-Instruct: + cmd: PYTHONPATH=. 
python scripts/sft/evaluate.py --output-file + data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json + deps: + - path: + data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json + hash: md5 + md5: df2f1d464152f87737c8ebb5b0673854 + size: 2179383 + - path: scripts/sft/evaluate.py + hash: md5 + md5: 5ee442a9a3525af7596bf24c3d724a1d + size: 570 + outs: + - path: + data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json + hash: md5 + md5: 521a731cc2c45d3eda0656a8e69d505b + size: 307 + predict@Bielik-7B-Instruct-v0.1: + cmd: PYTHONPATH=. python scripts/sft/predict.py model=Bielik-7B-Instruct-v0.1 + deps: + - path: configs/model/Bielik-7B-Instruct-v0.1.yaml + hash: md5 + md5: ea2309177451ac16db4c2c7a5b7aed3b + size: 140 + - path: configs/predict.yaml + hash: md5 + md5: e6b047cf62e612a32381d6221eb99b4e + size: 416 + - path: scripts/sft/predict.py + hash: md5 + md5: 69e4844a715c9c5c75e1127a06472ad4 + size: 3148 + outs: + - path: data/experiments/predict/pl-court-instruct/outputs_Bielik-7B-Instruct-v0.1.json + hash: md5 + md5: 58f1b7a5d06cca3989c8b373c5429162 + size: 2033178 + sft_unsloth@Unsloth-Mistral-7B-Instruct-v0.3: + cmd: PYTHONPATH=. python scripts/sft/fine_tune_unsloth.py model=Unsloth-Mistral-7B-Instruct-v0.3 + deps: + - path: configs/fine_tuning.yaml + hash: md5 + md5: 9cd6fd320530e1c8ded7d9c369b8a082 + size: 440 + - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml + hash: md5 + md5: 71dbbb0a8a2454c7c0210e2d1acd859d + size: 167 + - path: scripts/sft/fine_tune_unsloth.py + hash: md5 + md5: c8a06fdcb01188a621b5fc9cc579ea56 + size: 6904 + outs: + - path: data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/pl-court-instruct + hash: md5 + md5: 914a39b11765124b6548bfa3f5ef64e1.dir + size: 4084044746 + nfiles: 192 + evaluate@Bielik-7B-Instruct-v0.1: + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/outputs_Bielik-7B-Instruct-v0.1.json + deps: + - path: data/experiments/predict/pl-court-instruct/outputs_Bielik-7B-Instruct-v0.1.json + hash: md5 + md5: 58f1b7a5d06cca3989c8b373c5429162 + size: 2033178 + - path: scripts/sft/evaluate.py + hash: md5 + md5: 5ee442a9a3525af7596bf24c3d724a1d + size: 570 + outs: + - path: data/experiments/predict/pl-court-instruct/metrics_Bielik-7B-Instruct-v0.1.json + hash: md5 + md5: 2d1b6a392152f2e022a33553265e141a + size: 306 diff --git a/dvc.yaml b/dvc.yaml index 43b34df..27960b4 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -58,8 +58,11 @@ stages: - Meta-Llama-3-8B-Instruct - Mistral-7B-Instruct-v0.2 - Mistral-7B-Instruct-v0.2-fine-tuned + - Bielik-7B-Instruct-v0.1 + - Unsloth-Llama-3-8B-Instruct - Unsloth-Llama-3-8B-Instruct-fine-tuned - Unsloth-Mistral-7B-Instruct-v0.3 + - Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned cmd: >- PYTHONPATH=. python scripts/sft/predict.py model=${item.model} deps: @@ -72,11 +75,11 @@ stages: evaluate: matrix: model: - - Meta-Llama-3-8B-Instruct - - Mistral-7B-Instruct-v0.2 - - Mistral-7B-Instruct-v0.2-fine-tuned + - Unsloth-Llama-3-8B-Instruct - Unsloth-Llama-3-8B-Instruct-fine-tuned - Unsloth-Mistral-7B-Instruct-v0.3 + - Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned + - Bielik-7B-Instruct-v0.1 cmd: >- PYTHONPATH=. 
python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/outputs_${item.model}.json diff --git a/nbs/Data/02_Analyse_sft.ipynb b/nbs/Data/02_Analyse_sft.ipynb new file mode 100644 index 0000000..5e77e17 --- /dev/null +++ b/nbs/Data/02_Analyse_sft.ipynb @@ -0,0 +1,312 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "import json\n", + "from multiprocessing import Pool\n", + "from statistics import mean\n", + "from typing import Any\n", + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "from tqdm.auto import tqdm\n", + "from ipywidgets import interact\n", + "\n", + "from juddges.utils.misc import parse_yaml\n", + "from juddges.metrics.info_extraction import evaluate_extraction\n", + "\n", + "pd.options.display.float_format = '{:,.3f}'.format\n", + "warnings.filterwarnings('ignore', message=\"To copy construct from a tensor, it is recommended to use\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compare metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
llmfull_text_chrfcourt_namedatedepartment_namejudgeslegal_basesrecordersignature
2Meta-Llama-3-8B-Instruct0.2470.8620.9710.8330.8820.2870.8050.778
0Mistral-7B-Instruct-v0.20.4320.8390.9220.8500.8790.3330.8370.145
3Mistral-7B-Instruct-v0.2-fine-tuned0.7720.9870.9900.9650.9520.6000.9790.972
4Unsloth-Llama-3-8B-Instruct-fine-tuned0.8280.9950.9890.9860.9770.6010.9930.994
1Unsloth-Mistral-7B-Instruct-v0.30.4770.8300.9870.9000.8700.4190.9430.567
5Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned0.7980.9950.9880.9860.9670.6080.9870.976
\n", + "
" + ], + "text/plain": [ + " llm full_text_chrf court_name \\\n", + "2 Meta-Llama-3-8B-Instruct 0.247 0.862 \n", + "0 Mistral-7B-Instruct-v0.2 0.432 0.839 \n", + "3 Mistral-7B-Instruct-v0.2-fine-tuned 0.772 0.987 \n", + "4 Unsloth-Llama-3-8B-Instruct-fine-tuned 0.828 0.995 \n", + "1 Unsloth-Mistral-7B-Instruct-v0.3 0.477 0.830 \n", + "5 Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned 0.798 0.995 \n", + "\n", + " date department_name judges legal_bases recorder signature \n", + "2 0.971 0.833 0.882 0.287 0.805 0.778 \n", + "0 0.922 0.850 0.879 0.333 0.837 0.145 \n", + "3 0.990 0.965 0.952 0.600 0.979 0.972 \n", + "4 0.989 0.986 0.977 0.601 0.993 0.994 \n", + "1 0.987 0.900 0.870 0.419 0.943 0.567 \n", + "5 0.988 0.986 0.967 0.608 0.987 0.976 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results = []\n", + "for f in Path(\"data/experiments/predict/pl-court-instruct\").glob(\"metrics_*.json\"):\n", + " model_name = f.stem.replace(\"metrics_\", \"\")\n", + " with f.open() as file:\n", + " m_res = json.load(file)\n", + " results.append(\n", + " {\"llm\": model_name}\n", + " | {\"full_text_chrf\": m_res[\"full_text_chrf\"]}\n", + " | m_res[\"field_chrf\"]\n", + " )\n", + "\n", + "pd.DataFrame(results).sort_values(\"llm\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inspect results" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "OUTPUTS_PATH = \"data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct-fine-tuned.json\"\n", + "\n", + "with open(OUTPUTS_PATH) as file:\n", + " data = json.load(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "26e97e0494064714ab86ea2f0ea4b0b6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2000 [00:00 dict[str, Any]:\n", + " item[\"metrics\"] = evaluate_extraction([item])\n", + " item[\"metrics\"][\"mean_field\"] = mean(item[\"metrics\"][\"field_chrf\"].values())\n", + " item[\"gold\"] = parse_yaml(item[\"gold\"])\n", + " try:\n", + " item[\"answer\"] = parse_yaml(item[\"answer\"])\n", + " except:\n", + " item[\"answer\"] = None\n", + " return item\n", + "\n", + "num_invalid_answers = 0\n", + "results = []\n", + "with Pool(10) as pool:\n", + " for item in tqdm(pool.imap(eval_item, data), total=len(data)):\n", + " results.append(item)\n", + " if item[\"answer\"] is None:\n", + " num_invalid_answers += 1\n", + "\n", + "print(f\"Number of invalid answers: {num_invalid_answers} / {len(data)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "26fcb134773c4fbc9fbb60645895253c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(Dropdown(description='idx', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data_valid = [item for item in results if item[\"answer\"] is not None]\n", + "data_valid = sorted(data_valid, key=lambda x: x[\"metrics\"][\"mean_field\"])\n", + "\n", + "def item_to_df(idx: int) -> pd.DataFrame:\n", + " item = data_valid[idx]\n", + " return pd.DataFrame({\n", + " \"gold\": item[\"gold\"],\n", + " \"answer\": item[\"answer\"],\n", + " \"metrics\": 
item[\"metrics\"][\"field_chrf\"],\n", + " })\n", + "\n", + "\n", + "interact(item_to_df, idx=range(len(data_valid)));" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "juddges", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}