diff --git a/configs/embedding.yaml b/configs/embedding.yaml index 37afafe..cb416ec 100644 --- a/configs/embedding.yaml +++ b/configs/embedding.yaml @@ -10,6 +10,7 @@ chunk_config: chunk_size: ${embedding_model.max_seq_length} min_split_chars: 10 take_n_first_chunks: 16 + chunk_overlap: 32 batch_size: 64 output_dir: data/embeddings/${dataset.name}/${hydra:runtime.choices.embedding_model}/all_embeddings diff --git a/dvc.lock b/dvc.lock index 7c5fb51..47c5437 100644 --- a/dvc.lock +++ b/dvc.lock @@ -65,29 +65,29 @@ stages: embed@mmlw-roberta-large: cmd: PYTHONPATH=. python scripts/embed/embed_text.py embedding_model=mmlw-roberta-large deps: - - hash: md5 - md5: 22fa56f7d7d5a1c1372a8a8b57b02ba8 - path: configs/embedding.yaml - size: 467 - - hash: md5 + - path: configs/embedding.yaml + hash: md5 + md5: 9a163f8656c6efa150fd7f939bb32e49 + size: 477 + - path: configs/embedding_model/mmlw-roberta-large.yaml + hash: md5 md5: 22f36cfd196c0fdc3cfd8a036d52b606 - path: configs/embedding_model/mmlw-roberta-large.yaml size: 52 - - hash: md5 - md5: 5dd44be2eea852bcce3d0918ff8b97da.dir + - path: data/datasets/pl/raw + hash: md5 + md5: 622ba21868561c26fb6877ad95bfb5c5.dir + size: 10234505621 nfiles: 17 - path: data/datasets/pl/raw - size: 10234880729 - - hash: md5 - md5: a2953ae4974ef96d62063b5c2711e967 - path: scripts/embed/embed_text.py - size: 3549 - outs: - - hash: md5 - md5: 1a086db46b90b0f3c4c66c3ecefe8adb.dir - nfiles: 53 - path: data/embeddings/pl-court-raw/mmlw-roberta-large/all_embeddings - size: 24415235644 + - path: scripts/embed/embed_text.py + hash: md5 + md5: d9f127f2e92afa40f23ebcd6cf540cb9 + size: 3743 + outs: + - path: data/embeddings/pl-court-raw/mmlw-roberta-large/all_embeddings + hash: md5 + md5: a8a4a370199cce269899df89f4e33fdc.dir + size: 23430894782 + nfiles: 51 evaluate_api_models@en-court-instruct-open_ai_gpt-4o-997: cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json --num-proc=-1 @@ -157,12 +157,14 @@ stages: path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/metrics_997.json size: 305 evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-42: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json --num-proc=-1 deps: - hash: md5 md5: 761018c0a306fbee63dad2fbc119110d - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json size: 821683 - hash: md5 md5: 0644efb76af2c5461185e37a07ba2c17 @@ -171,15 +173,18 @@ stages: outs: - hash: md5 md5: 265776ba10a7b24b66e6bac1131e0c48 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_42.json size: 149 evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-7312: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json + cmd: PYTHONPATH=. 
python scripts/sft/evaluate.py --output-file + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json --num-proc=-1 deps: - hash: md5 md5: a7361535b440251d6ce6232a15cfcdf2 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json size: 818877 - hash: md5 md5: 0644efb76af2c5461185e37a07ba2c17 @@ -188,15 +193,18 @@ stages: outs: - hash: md5 md5: 97fa8dfaa5e57633e8fb6a7d073177f5 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_7312.json size: 147 evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-997: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json --num-proc=-1 deps: - hash: md5 md5: 94924275d576271875fecf22c0f9b39e - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json size: 817490 - hash: md5 md5: 0644efb76af2c5461185e37a07ba2c17 @@ -205,15 +213,18 @@ stages: outs: - hash: md5 md5: c3552161ec68d8cc6a8e5b75f02e22e2 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_997.json size: 147 evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-42: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json --num-proc=-1 deps: - hash: md5 md5: 4246a4fafba5e130aac3db6c1c61ce30 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json size: 675578 - hash: md5 md5: 0644efb76af2c5461185e37a07ba2c17 @@ -222,15 +233,18 @@ stages: outs: - hash: md5 md5: 016d1c87b2925c6f941400d178bee018 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/metrics_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/metrics_42.json size: 157 evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-7312: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json + cmd: PYTHONPATH=. 
python scripts/sft/evaluate.py --output-file + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json --num-proc=-1 deps: - hash: md5 md5: f0b806eebca2f3ddf49d0ff821856b45 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json size: 670935 - hash: md5 md5: 0644efb76af2c5461185e37a07ba2c17 @@ -239,15 +253,18 @@ stages: outs: - hash: md5 md5: a8459393feb773fea85ede4b831b3fa6 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/metrics_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/metrics_7312.json size: 157 evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-997: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json --num-proc=-1 deps: - hash: md5 md5: 4e968cac351ad48ad786d1ecccbbc967 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json size: 670674 - hash: md5 md5: 0644efb76af2c5461185e37a07ba2c17 @@ -256,15 +273,18 @@ stages: outs: - hash: md5 md5: 21bc79aad7ab2e97b75e1d3fb18a2263 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/metrics_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/metrics_997.json size: 157 evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-42: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json --num-proc=-1 deps: - hash: md5 md5: 4fe25ad80a20ea5d6200136176b3e4ca - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json size: 705218 - hash: md5 md5: 0644efb76af2c5461185e37a07ba2c17 @@ -273,15 +293,18 @@ stages: outs: - hash: md5 md5: 0b2f663a1cbc3ef08c363ec8adc53c15 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_42.json size: 151 evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-7312: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json + cmd: PYTHONPATH=. 
python scripts/sft/evaluate.py --output-file + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json --num-proc=-1 deps: - hash: md5 md5: cf4fdbf0e26e6c793bdca4edd6e365c0 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json size: 703876 - hash: md5 md5: 0644efb76af2c5461185e37a07ba2c17 @@ -290,15 +313,18 @@ stages: outs: - hash: md5 md5: 604b5cee14ec6520b88bafecc962e031 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_7312.json size: 152 evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-997: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json --num-proc=-1 deps: - hash: md5 md5: 94c30cf8fe7db71afc58a5c9cdbc0d9f - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json size: 705894 - hash: md5 md5: 0644efb76af2c5461185e37a07ba2c17 @@ -307,15 +333,18 @@ stages: outs: - hash: md5 md5: a91ec5b434bebd8ce1d2000e0a033cb9 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_997.json size: 152 evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-42: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json --num-proc=-1 deps: - hash: md5 md5: 313fa5a662f37cacae4980a04830f422 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json size: 642688 - hash: md5 md5: 0644efb76af2c5461185e37a07ba2c17 @@ -324,15 +353,18 @@ stages: outs: - hash: md5 md5: f0d37c5ac017c0e488b7c3bed01c7093 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/metrics_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/metrics_42.json size: 156 evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-7312: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json + cmd: PYTHONPATH=. 
python scripts/sft/evaluate.py --output-file + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json --num-proc=-1 deps: - hash: md5 md5: 4ed8db93aa14f1cc98e276d3989efa9e - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json size: 642730 - hash: md5 md5: 0644efb76af2c5461185e37a07ba2c17 @@ -341,15 +373,18 @@ stages: outs: - hash: md5 md5: a1521ab06a56258759953bb02ae87e24 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/metrics_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/metrics_7312.json size: 157 evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-997: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json --num-proc=-1 deps: - hash: md5 md5: 787c129090aa1b64e337b236a4391402 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json size: 642477 - hash: md5 md5: 0644efb76af2c5461185e37a07ba2c17 @@ -358,7 +393,8 @@ stages: outs: - hash: md5 md5: f3339245ea358de4b1348c8393153946 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/metrics_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/metrics_997.json size: 157 evaluate_llm_as_judge_api_models@en-gpt_4o_mini-open_ai_gpt-4o-997: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -396,7 +432,8 @@ stages: outs: - hash: md5 md5: bd272bea099716c0c2e689a2d19c0071 - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json + path: + data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json size: 488 evaluate_llm_as_judge_api_models@pl-gpt_4o_mini-open_ai_gpt-4o-997: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -434,7 +471,8 @@ stages: outs: - hash: md5 md5: 24037233e5abe74fe13f69dd4fc5e26a - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json + path: + data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json size: 1173 evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-42: cmd: PYTHONPATH=. 
python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -444,7 +482,8 @@ stages: deps: - hash: md5 md5: 761018c0a306fbee63dad2fbc119110d - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json size: 821683 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -453,7 +492,8 @@ stages: outs: - hash: md5 md5: 77ecbff8c82afbfd6fec098fb87e1218 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json size: 478 evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-7312: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -463,7 +503,8 @@ stages: deps: - hash: md5 md5: a7361535b440251d6ce6232a15cfcdf2 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json size: 818877 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -472,7 +513,8 @@ stages: outs: - hash: md5 md5: f25c9ad98ef817e976def98d6b7d3b5d - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json size: 482 evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-997: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -482,7 +524,8 @@ stages: deps: - hash: md5 md5: 94924275d576271875fecf22c0f9b39e - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json size: 817490 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -491,7 +534,8 @@ stages: outs: - hash: md5 md5: 4395c32931d25a1bd9aa092c5a0e5460 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json size: 478 evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-42: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -501,7 +545,8 @@ stages: deps: - hash: md5 md5: 4246a4fafba5e130aac3db6c1c61ce30 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json size: 675578 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -510,7 +555,8 @@ stages: outs: - hash: md5 md5: 5f2cea81c873a3b85ef95ba9a6dc90a5 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_42.json size: 487 evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-7312: cmd: PYTHONPATH=. 
python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -520,7 +566,8 @@ stages: deps: - hash: md5 md5: f0b806eebca2f3ddf49d0ff821856b45 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json size: 670935 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -529,7 +576,8 @@ stages: outs: - hash: md5 md5: 5cc45cac8a7607e42a8a394593d33396 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_7312.json size: 486 evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-997: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -539,7 +587,8 @@ stages: deps: - hash: md5 md5: 4e968cac351ad48ad786d1ecccbbc967 - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json size: 670674 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -548,7 +597,8 @@ stages: outs: - hash: md5 md5: 90c2b0cd132130d0b9d3a60bf6fdd69b - path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_997.json size: 486 evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-42: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -558,7 +608,8 @@ stages: deps: - hash: md5 md5: 4fe25ad80a20ea5d6200136176b3e4ca - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json size: 705218 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -567,7 +618,8 @@ stages: outs: - hash: md5 md5: 69901f631da4ffefd09e7cbfac39cd89 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json size: 480 evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-7312: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -577,7 +629,8 @@ stages: deps: - hash: md5 md5: cf4fdbf0e26e6c793bdca4edd6e365c0 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json size: 703876 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -586,7 +639,8 @@ stages: outs: - hash: md5 md5: 860b5c00ace1f2967db9b5a977cfc3ad - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json size: 478 evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-997: cmd: PYTHONPATH=. 
python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -596,7 +650,8 @@ stages: deps: - hash: md5 md5: 94c30cf8fe7db71afc58a5c9cdbc0d9f - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json size: 705894 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -605,7 +660,8 @@ stages: outs: - hash: md5 md5: 860b5c00ace1f2967db9b5a977cfc3ad - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json size: 478 evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-42: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -615,7 +671,8 @@ stages: deps: - hash: md5 md5: 313fa5a662f37cacae4980a04830f422 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json size: 642688 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -624,7 +681,8 @@ stages: outs: - hash: md5 md5: 974e972a09d844a77840029d642e8077 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_42.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_42.json size: 486 evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-7312: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -634,7 +692,8 @@ stages: deps: - hash: md5 md5: 4ed8db93aa14f1cc98e276d3989efa9e - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json size: 642730 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -643,7 +702,8 @@ stages: outs: - hash: md5 md5: 8a9712eb10a8da99d86bab8968fd3207 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_7312.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_7312.json size: 485 evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-997: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -653,7 +713,8 @@ stages: deps: - hash: md5 md5: 787c129090aa1b64e337b236a4391402 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json size: 642477 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -662,7 +723,8 @@ stages: outs: - hash: md5 md5: 34de8eabaebe6a96b4b664b664f222e2 - path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_997.json + path: + data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_997.json size: 484 evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-11B-v2.2-Instruct-42: cmd: PYTHONPATH=. 
python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -672,7 +734,8 @@ stages: deps: - hash: md5 md5: c3e404c898e3e193ac3aa910187b4f9f - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_42.json + path: + data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_42.json size: 1734129 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -681,7 +744,8 @@ stages: outs: - hash: md5 md5: 198f24599357bc230bf9f1e39a235a44 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/judge_metrics_42.json + path: + data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/judge_metrics_42.json size: 1172 evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-11B-v2.2-Instruct-7312: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -691,7 +755,8 @@ stages: deps: - hash: md5 md5: d4a2ab2393a58f0d7e1897859eccb626 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_7312.json + path: + data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_7312.json size: 1734772 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -700,7 +765,8 @@ stages: outs: - hash: md5 md5: 81cfdaa675ef2118cf923e57cc54d201 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/judge_metrics_7312.json + path: + data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/judge_metrics_7312.json size: 1161 evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-11B-v2.2-Instruct-997: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -710,7 +776,8 @@ stages: deps: - hash: md5 md5: 8f4f6bc97e33b3b2728bebb7620a4968 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_997.json + path: + data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_997.json size: 1731689 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -719,7 +786,8 @@ stages: outs: - hash: md5 md5: c5861ffaa439ba9bbd95b954d6ab1f3d - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/judge_metrics_997.json + path: + data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/judge_metrics_997.json size: 1168 evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-11B-v2.2-Instruct-fine-tuned-42: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -729,7 +797,8 @@ stages: deps: - hash: md5 md5: dfd5d7389b312686428cc967aea5a5b9 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_42.json + path: + data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_42.json size: 1860743 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -738,7 +807,8 @@ stages: outs: - hash: md5 md5: abcd5722e84ec3e81ff8cf28b8a887cb - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/judge_metrics_42.json + path: + data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/judge_metrics_42.json size: 1165 evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-11B-v2.2-Instruct-fine-tuned-7312: cmd: PYTHONPATH=. 
python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -748,7 +818,8 @@ stages: deps: - hash: md5 md5: 8fa2faeda5a577c06cd6bf35b8702330 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_7312.json + path: + data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_7312.json size: 1857569 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -757,7 +828,8 @@ stages: outs: - hash: md5 md5: 4b77a3d10cd6027e7e141ba80e9678c2 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/judge_metrics_7312.json + path: + data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/judge_metrics_7312.json size: 1160 evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-11B-v2.2-Instruct-fine-tuned-997: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -767,7 +839,8 @@ stages: deps: - hash: md5 md5: ba53d76f701eddb60a182de49d992878 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_997.json + path: + data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_997.json size: 1857855 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -776,7 +849,8 @@ stages: outs: - hash: md5 md5: 9e60a1ed6002a0349656c0bd23bc7b1c - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/judge_metrics_997.json + path: + data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/judge_metrics_997.json size: 1164 evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-42: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -786,7 +860,8 @@ stages: deps: - hash: md5 md5: 2dc39513a04910c5d0c54380166639d9 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json + path: + data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json size: 2029644 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -795,7 +870,8 @@ stages: outs: - hash: md5 md5: 243da4df07c6dfb5199b925e3f5c07aa - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_42.json + path: + data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_42.json size: 1137 evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-7312: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -805,7 +881,8 @@ stages: deps: - hash: md5 md5: ae39bf31296ffe82c0f6a3e8c9ff63aa - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json + path: + data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json size: 2014399 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -814,7 +891,8 @@ stages: outs: - hash: md5 md5: 8098cc937d57455ca47d32c3449159a3 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_7312.json + path: + data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_7312.json size: 1129 evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-997: cmd: PYTHONPATH=. 
python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -824,7 +902,8 @@ stages: deps: - hash: md5 md5: fac04d78ad020b50f79fc7277a037e8e - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json + path: + data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json size: 2016400 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -833,7 +912,8 @@ stages: outs: - hash: md5 md5: f1390b2d50893a17c90fc277dc363d6a - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_997.json + path: + data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_997.json size: 1139 evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-fine-tuned-42: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -843,7 +923,8 @@ stages: deps: - hash: md5 md5: 178eb0649617d4a698da6c9e315e84c5 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json + path: + data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json size: 2034749 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -852,7 +933,8 @@ stages: outs: - hash: md5 md5: 302b957707520fa327d1da0edf18baa3 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_42.json + path: + data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_42.json size: 1167 evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-fine-tuned-7312: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -862,7 +944,8 @@ stages: deps: - hash: md5 md5: 743ea22448bc73a7a991da075fca8841 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json + path: + data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json size: 2031343 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -871,7 +954,8 @@ stages: outs: - hash: md5 md5: 789f0906846251d3f0cab78d111f9c56 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_7312.json + path: + data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_7312.json size: 1163 evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-fine-tuned-997: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -881,7 +965,8 @@ stages: deps: - hash: md5 md5: 433a4b2aa7870a134277a265d099a588 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json + path: + data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json size: 2029482 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -890,7 +975,8 @@ stages: outs: - hash: md5 md5: 90f3ed04ef29c5cd29b7ec8f02a780a1 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_997.json + path: + data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_997.json size: 1163 evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-42: cmd: PYTHONPATH=. 
python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -900,7 +986,8 @@ stages: deps: - hash: md5 md5: e99c88720116c951087b6125e5f4be4d - path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json size: 2008073 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -909,7 +996,8 @@ stages: outs: - hash: md5 md5: 9d9fba0cf2169e9dd9f69579a2182b8e - path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json size: 1172 evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-7312: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -919,7 +1007,8 @@ stages: deps: - hash: md5 md5: 4c25368aacb7402b1b2cae9368d187d1 - path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json size: 2013637 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -928,7 +1017,8 @@ stages: outs: - hash: md5 md5: e58171fc082d33c84497a13dabcf766c - path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json size: 1167 evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-997: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -938,7 +1028,8 @@ stages: deps: - hash: md5 md5: baef589507248af212aaae51602fd999 - path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json size: 2010150 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -947,7 +1038,8 @@ stages: outs: - hash: md5 md5: f8d16a5298fabe288486822779470cd8 - path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json size: 1165 evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-42: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -957,7 +1049,8 @@ stages: deps: - hash: md5 md5: 289b719e8c7166e578417e5706bdc4e3 - path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json size: 1760355 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -966,7 +1059,8 @@ stages: outs: - hash: md5 md5: 70398042d030309e7e0bc7ba927136f3 - path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_42.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_42.json size: 1167 evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-7312: cmd: PYTHONPATH=. 
python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -976,7 +1070,8 @@ stages: deps: - hash: md5 md5: 25bee3b4ee09b36d636095b4c927a0d3 - path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json size: 1759194 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -985,7 +1080,8 @@ stages: outs: - hash: md5 md5: 9d22089c8d23bbc5a028c748e5522c23 - path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_7312.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_7312.json size: 1157 evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-997: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -995,7 +1091,8 @@ stages: deps: - hash: md5 md5: 82b2c535d99d91b9a34986375bfa31a9 - path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json size: 1758747 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -1004,7 +1101,8 @@ stages: outs: - hash: md5 md5: 4222d5b165de8a3a89d71d6519b71b76 - path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_997.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_997.json size: 1170 evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-42: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -1014,7 +1112,8 @@ stages: deps: - hash: md5 md5: 1385f49966e9db2a88a17f53d0887ad8 - path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json size: 1741944 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -1023,7 +1122,8 @@ stages: outs: - hash: md5 md5: f4bac633a65afde9bf5612f35c3089bb - path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json size: 1170 evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-7312: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -1033,7 +1133,8 @@ stages: deps: - hash: md5 md5: 924744efce1483e9128579cad7a4454c - path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json size: 1748772 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -1042,7 +1143,8 @@ stages: outs: - hash: md5 md5: 1f95777ef87a547fa7a41dc597adfc39 - path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json size: 1166 evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-997: cmd: PYTHONPATH=. 
python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -1052,7 +1154,8 @@ stages: deps: - hash: md5 md5: 4d023797a9053fd7df61f6b1796112e9 - path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json size: 1747404 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -1061,7 +1164,8 @@ stages: outs: - hash: md5 md5: de3f557dfdf3440262e4d8f811e526ca - path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json size: 1167 evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-42: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -1071,7 +1175,8 @@ stages: deps: - hash: md5 md5: 14d4613f7d9495f5fb5f2d7b81f402a9 - path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json size: 1825646 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -1080,7 +1185,8 @@ stages: outs: - hash: md5 md5: e8cff190991ee3164825dbf7eca03d12 - path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_42.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_42.json size: 1170 evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-7312: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -1090,7 +1196,8 @@ stages: deps: - hash: md5 md5: 302e1dc4f064007e3df88ac1e8acccc5 - path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json size: 1831330 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -1099,7 +1206,8 @@ stages: outs: - hash: md5 md5: aee4a08e0a4d0398b34a2587c039244d - path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_7312.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_7312.json size: 1169 evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-997: cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini @@ -1109,7 +1217,8 @@ stages: deps: - hash: md5 md5: 41a47dc56efc29b6c2771db68bdacb17 - path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json size: 1822491 - hash: md5 md5: 79a02fb864cb279f93fc4171043bb31c @@ -1118,231 +1227,12 @@ stages: outs: - hash: md5 md5: aac703269b10c85d1a2b5303c22ca077 - path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_997.json + path: + data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_997.json size: 1168 evaluate_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-42: - cmd: PYTHONPATH=. 
python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_42.json - --num-proc=-1 - deps: - - hash: md5 - md5: c3e404c898e3e193ac3aa910187b4f9f - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_42.json - size: 1734129 - - hash: md5 - md5: 0644efb76af2c5461185e37a07ba2c17 - path: scripts/sft/evaluate.py - size: 697 - outs: - - hash: md5 - md5: a75ab0f8f8238ab8c86397dd015fd31d - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/metrics_42.json - size: 306 - evaluate_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-7312: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_7312.json - --num-proc=-1 - deps: - - hash: md5 - md5: d4a2ab2393a58f0d7e1897859eccb626 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_7312.json - size: 1734772 - - hash: md5 - md5: 0644efb76af2c5461185e37a07ba2c17 - path: scripts/sft/evaluate.py - size: 697 - outs: - - hash: md5 - md5: d5861dc30fca8f9bd2d311d924b3905d - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/metrics_7312.json - size: 305 - evaluate_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-997: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_997.json - --num-proc=-1 - deps: - - hash: md5 - md5: 8f4f6bc97e33b3b2728bebb7620a4968 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_997.json - size: 1731689 - - hash: md5 - md5: 0644efb76af2c5461185e37a07ba2c17 - path: scripts/sft/evaluate.py - size: 697 - outs: - - hash: md5 - md5: cd6699727392af2d61383b05fa962741 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/metrics_997.json - size: 306 - evaluate_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-fine-tuned-42: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_42.json - --num-proc=-1 - deps: - - hash: md5 - md5: dfd5d7389b312686428cc967aea5a5b9 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_42.json - size: 1860743 - - hash: md5 - md5: 0644efb76af2c5461185e37a07ba2c17 - path: scripts/sft/evaluate.py - size: 697 - outs: - - hash: md5 - md5: d1462bb74d1f8790270a5d97c674891c - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/metrics_42.json - size: 304 - evaluate_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-fine-tuned-7312: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_7312.json - --num-proc=-1 - deps: - - hash: md5 - md5: 8fa2faeda5a577c06cd6bf35b8702330 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_7312.json - size: 1857569 - - hash: md5 - md5: 0644efb76af2c5461185e37a07ba2c17 - path: scripts/sft/evaluate.py - size: 697 - outs: - - hash: md5 - md5: 5edacea1e40b97765c7eaa7b4991ab16 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/metrics_7312.json - size: 306 - evaluate_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-fine-tuned-997: - cmd: PYTHONPATH=. 
python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_997.json - --num-proc=-1 - deps: - - hash: md5 - md5: ba53d76f701eddb60a182de49d992878 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_997.json - size: 1857855 - - hash: md5 - md5: 0644efb76af2c5461185e37a07ba2c17 - path: scripts/sft/evaluate.py - size: 697 - outs: - - hash: md5 - md5: 84fbcf83da746f9e98f70ab22be6f238 - path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/metrics_997.json - size: 304 - evaluate_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-42: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json - --num-proc=-1 - deps: - - hash: md5 - md5: 2dc39513a04910c5d0c54380166639d9 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json - size: 2029644 - - hash: md5 - md5: 0644efb76af2c5461185e37a07ba2c17 - path: scripts/sft/evaluate.py - size: 697 - outs: - - hash: md5 - md5: 2cbca38fd0bbdb4df024f76506eeb26c - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/metrics_42.json - size: 307 - evaluate_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-7312: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json - --num-proc=-1 - deps: - - hash: md5 - md5: ae39bf31296ffe82c0f6a3e8c9ff63aa - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json - size: 2014399 - - hash: md5 - md5: 0644efb76af2c5461185e37a07ba2c17 - path: scripts/sft/evaluate.py - size: 697 - outs: - - hash: md5 - md5: ad13d47ca88e721be75c79c225e12ee6 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/metrics_7312.json - size: 289 - evaluate_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-997: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json - --num-proc=-1 - deps: - - hash: md5 - md5: fac04d78ad020b50f79fc7277a037e8e - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json - size: 2016400 - - hash: md5 - md5: 0644efb76af2c5461185e37a07ba2c17 - path: scripts/sft/evaluate.py - size: 697 - outs: - - hash: md5 - md5: 83fb160145ef5e21b43f7c348658ea02 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/metrics_997.json - size: 327 - evaluate_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-42: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json - --num-proc=-1 - deps: - - hash: md5 - md5: 178eb0649617d4a698da6c9e315e84c5 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json - size: 2034749 - - hash: md5 - md5: 0644efb76af2c5461185e37a07ba2c17 - path: scripts/sft/evaluate.py - size: 697 - outs: - - hash: md5 - md5: 99e684c720ca4c4ef6c4276e7d1880ab - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/metrics_42.json - size: 305 - evaluate_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-7312: - cmd: PYTHONPATH=. 
python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json - --num-proc=-1 - deps: - - hash: md5 - md5: 743ea22448bc73a7a991da075fca8841 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json - size: 2031343 - - hash: md5 - md5: 0644efb76af2c5461185e37a07ba2c17 - path: scripts/sft/evaluate.py - size: 697 - outs: - - hash: md5 - md5: 07d798079cedf3dc194242d6a1bc3bcd - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/metrics_7312.json - size: 306 - evaluate_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-997: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json - --num-proc=-1 - deps: - - hash: md5 - md5: 433a4b2aa7870a134277a265d099a588 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json - size: 2029482 - - hash: md5 - md5: 0644efb76af2c5461185e37a07ba2c17 - path: scripts/sft/evaluate.py - size: 697 - outs: - - hash: md5 - md5: adb7c1e239396bbf6e308f3f1b436099 - path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/metrics_997.json - size: 307 - evaluate_pl@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-42: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json - --num-proc=-1 - deps: - - hash: md5 - md5: e99c88720116c951087b6125e5f4be4d - path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json - size: 2008073 - - hash: md5 - md5: 0644efb76af2c5461185e37a07ba2c17 - path: scripts/sft/evaluate.py - size: 697 - outs: - - hash: md5 - md5: 2116481b79c785f94b35852b6e0e4f57 - path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_42.json - size: 304 - evaluate_pl@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-7312: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json + cmd: PYTHONPATH=. 
python scripts/sft/evaluate.py --output-file
+      data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_42.json
       --num-proc=-1
     deps:
     - hash: md5
diff --git a/juddges/_modidx.py b/juddges/_modidx.py
index 4e513c7..c3de275 100644
--- a/juddges/_modidx.py
+++ b/juddges/_modidx.py
@@ -11,6 +11,7 @@
             'juddges.data.datasets.utils': {},
             'juddges.data.pl_court_api': {},
             'juddges.data.pl_court_graph': {},
+            'juddges.data.weaviate_db': {},
             'juddges.evaluation.eval_full_text': {},
             'juddges.evaluation.eval_structured': {},
             'juddges.evaluation.eval_structured_llm_judge': {},
diff --git a/juddges/data/weaviate_db.py b/juddges/data/weaviate_db.py
new file mode 100644
index 0000000..e1dd412
--- /dev/null
+++ b/juddges/data/weaviate_db.py
@@ -0,0 +1,112 @@
+import re
+from abc import ABC, abstractmethod
+from typing import Any, ClassVar
+
+import weaviate
+import weaviate.classes.config as wvcc
+from weaviate.auth import Auth, _APIKey
+
+
+class WeaviateDatabase(ABC):
+    def __init__(self, host: str, port: str, grpc_port: str, api_key: str | None):
+        self.host = host
+        self.port = port
+        self.grpc_port = grpc_port
+        self.__api_key = api_key
+
+        self.client: weaviate.WeaviateClient
+
+    def __enter__(self) -> "WeaviateDatabase":
+        self.client = weaviate.connect_to_local(
+            host=self.host,
+            port=self.port,
+            grpc_port=self.grpc_port,
+            auth_credentials=self.api_key,
+        )
+        self.create_collections()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        if hasattr(self, "client"):
+            self.client.close()
+
+    def __del__(self) -> None:
+        self.__exit__(None, None, None)
+
+    @property
+    def api_key(self) -> _APIKey | None:
+        if self.__api_key is not None:
+            return Auth.api_key(self.__api_key)
+        return None
+
+    @abstractmethod
+    def create_collections(self) -> None:
+        pass
+
+    def insert_batch(
+        self,
+        collection: weaviate.collections.Collection,
+        objects: list[dict[str, Any]],
+    ) -> None:
+        with collection.batch.dynamic() as wv_batch:
+            for obj in objects:
+                wv_batch.add_object(**obj)
+                if wv_batch.number_errors > 0:
+                    break
+        if wv_batch.number_errors > 0:
+            errors = [err.message for err in collection.batch.results.objs.errors.values()]
+            raise ValueError(f"Error ingesting batch: {errors}")
+
+    def get_uuids(self, collection: weaviate.collections.Collection) -> list[str]:
+        return [str(obj.uuid) for obj in collection.iterator(return_properties=[])]
+
+    def _safe_create_collection(self, *args: Any, **kwargs: Any) -> None:
+        try:
+            self.client.collections.create(*args, **kwargs)
+        except weaviate.exceptions.UnexpectedStatusCodeError as err:
+            if (
+                re.search(r"class name (\w+?) already exists", err.message)
+                and err.status_code == 422
+            ):
+                pass
+            else:
+                raise
+
+
+class WeaviateJudgementsDatabase(WeaviateDatabase):
+    JUDGMENTS_COLLECTION: ClassVar[str] = "judgements"
+    JUDGMENT_CHUNKS_COLLECTION: ClassVar[str] = "judgement_chunks"
+
+    @property
+    def judgements_collection(self) -> weaviate.collections.Collection:
+        return self.client.collections.get(self.JUDGMENTS_COLLECTION)
+
+    @property
+    def judgement_chunks_collection(self) -> weaviate.collections.Collection:
+        return self.client.collections.get(self.JUDGMENT_CHUNKS_COLLECTION)
+
+    def create_collections(self) -> None:
+        self._safe_create_collection(
+            name=self.JUDGMENTS_COLLECTION,
+            properties=[
+                wvcc.Property(name="judgement_id", data_type=wvcc.DataType.TEXT),
+            ],
+        )
+        self._safe_create_collection(
+            name=self.JUDGMENT_CHUNKS_COLLECTION,
+            properties=[
+                wvcc.Property(name="chunk_id", data_type=wvcc.DataType.INT),
+                wvcc.Property(name="chunk_text", data_type=wvcc.DataType.TEXT),
+            ],
+            vectorizer_config=wvcc.Configure.Vectorizer.text2vec_transformers(),
+            references=[
+                wvcc.ReferenceProperty(
+                    name="judgementChunk",
+                    target_collection=self.JUDGMENTS_COLLECTION,
+                )
+            ],
+        )
+
+    @staticmethod
+    def uuid_from_judgement_chunk_id(judgement_id: str, chunk_id: int) -> str:
+        return weaviate.util.generate_uuid5(f"{judgement_id}_chunk_{chunk_id}")
diff --git a/juddges/preprocessing/text_chunker.py b/juddges/preprocessing/text_chunker.py
index 25ec80f..c7fb74e 100644
--- a/juddges/preprocessing/text_chunker.py
+++ b/juddges/preprocessing/text_chunker.py
@@ -8,6 +8,7 @@ class TextSplitter:
     def __init__(
         self,
         chunk_size: int,
+        chunk_overlap: int | None = None,
         min_split_chars: int | None = None,
         take_n_first_chunks: int | None = None,
         tokenizer: PreTrainedTokenizer | None = None,
@@ -16,6 +17,7 @@ def __init__(
             self.splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                 tokenizer,
                 chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
             )
         else:
             self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
diff --git a/requirements.txt b/requirements.txt
index c1ba949..48fc5e4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -33,6 +33,7 @@
 transformers==4.42.3
 trl==0.9.4
 typer==0.9.0
 wandb==0.16.5
+weaviate-client==4.8.1
 xmltodict==0.13.0
 xlsxwriter==3.2.0
diff --git a/scripts/embed/embed_text.py b/scripts/embed/embed_text.py
index 54bba5c..6c24f11 100644
--- a/scripts/embed/embed_text.py
+++ b/scripts/embed/embed_text.py
@@ -10,6 +10,7 @@
 from omegaconf import DictConfig
 from openai import BaseModel
 from sentence_transformers import SentenceTransformer
+from transformers import PreTrainedTokenizer
 from transformers.utils import is_flash_attn_2_available
 
 from juddges.config import EmbeddingModelConfig, RawDatasetConfig
@@ -21,6 +22,7 @@
 NUM_PROC = int(os.getenv("NUM_PROC", 1))
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+os.environ["TOKENIZERS_PARALLELISM"] = "false" if (NUM_PROC > 1) else "true"
 
 
 class EmbeddingConfig(BaseModel, extra="forbid"):
@@ -51,12 +53,6 @@ def main(cfg: DictConfig) -> None:
     )
     ds = ds.filter(lambda item: item["text"] is not None)
 
-    if config.chunk_config is not None:
-        ds = chunk_dataset(ds, config)
-        text_column = "text_chunk"
-    else:
-        text_column = "text"
-
     model = SentenceTransformer(
         config.embedding_model.name,
         device=DEVICE,
     )
     model.compile()
+    if config.chunk_config is not None:
+        ds = chunk_dataset(dataset=ds, config=config, tokenizer=model.tokenizer)
+        text_column = "text_chunk"
+ else: + text_column = "text" + if config.truncation_tokens is not None: assert config.truncation_tokens <= config.embedding_model.max_seq_length model.max_seq_length = config.truncation_tokens @@ -74,19 +76,22 @@ def main(cfg: DictConfig) -> None: batched=True, batch_size=config.batch_size, num_proc=None, - remove_columns=[text_column], desc="Embedding chunks", ) - ds.save_to_disk(config.output_dir) + ds.save_to_disk(str(config.output_dir)) with open(config.output_dir / "config.yaml", "w") as f: yaml.dump(config.model_dump(), f) -def chunk_dataset(dataset: Dataset, config: EmbeddingConfig) -> Dataset: +def chunk_dataset( + dataset: Dataset, + config: EmbeddingConfig, + tokenizer: PreTrainedTokenizer | None = None, +) -> Dataset: # todo: To be verified assert config.chunk_config is not None - split_worker = TextSplitter(**config.chunk_config) + split_worker = TextSplitter(**config.chunk_config, tokenizer=tokenizer) ds = dataset.select_columns(["_id", "text"]).map( split_worker, batched=True, diff --git a/scripts/embed/ingest.py b/scripts/embed/ingest_mongodb.py similarity index 100% rename from scripts/embed/ingest.py rename to scripts/embed/ingest_mongodb.py diff --git a/scripts/embed/ingest_weaviate.py b/scripts/embed/ingest_weaviate.py new file mode 100644 index 0000000..99d1a0d --- /dev/null +++ b/scripts/embed/ingest_weaviate.py @@ -0,0 +1,79 @@ +import math +import os +from pathlib import Path + +import typer +from datasets import load_dataset +from dotenv import load_dotenv +from loguru import logger +from tqdm.auto import tqdm + +from juddges.data.weaviate_db import WeaviateJudgementsDatabase +from weaviate.util import generate_uuid5 + +load_dotenv() +WV_HOST = os.getenv("WV_HOST", "localhost") +WV_PORT = os.getenv("WV_PORT", "8080") +WV_GRPC_PORT = os.getenv("WV_GRPC_PORT", "50051") +WV_API_KEY = os.getenv("WV_API_KEY", None) + +BATCH_SIZE = 64 +NUM_PROC = int(os.getenv("NUM_PROC", 1)) + +logger.info(f"Connecting to Weaviate at {WV_HOST}:{WV_PORT} (gRPC: {WV_GRPC_PORT})") + + +def main( + embeddings_dir: Path = typer.Option(...), + batch_size: int = typer.Option(BATCH_SIZE), + upsert: bool = typer.Option(False), +) -> None: + logger.warning( + "The script will upload local embeddings to the database, " + "make sure they are the same as in the inference module of the database." 
+    )
+    embs = load_dataset(str(embeddings_dir))["train"]
+    embs = embs.map(
+        lambda item: {
+            "uuid": WeaviateJudgementsDatabase.uuid_from_judgement_chunk_id(
+                judgement_id=item["_id"], chunk_id=item["chunk_id"]
+            )
+        },
+        num_proc=NUM_PROC,
+        desc="Generating UUIDs",
+    )
+    with WeaviateJudgementsDatabase(WV_HOST, WV_PORT, WV_GRPC_PORT, WV_API_KEY) as db:
+        if not upsert:
+            logger.info("upsert disabled - uploading only new embeddings")
+            uuids = set(db.get_uuids(db.judgement_chunks_collection))
+            embs = embs.filter(lambda item: item["uuid"] not in uuids)
+        else:
+            logger.info(
+                "upsert enabled - uploading all embeddings (automatically updating already uploaded)"
+            )
+
+        for batch in tqdm(
+            embs.iter(batch_size=batch_size),
+            total=math.ceil(len(embs) / batch_size),
+            desc="Uploading batches",
+        ):
+            objects = [
+                {
+                    "properties": {
+                        "judgment_id": batch["_id"][i],
+                        "chunk_id": batch["chunk_id"][i],
+                        "chunk_text": batch["text_chunk"][i],
+                    },
+                    "uuid": generate_uuid5(f"{batch['_id'][i]}_chunk_{batch['chunk_id'][i]}"),
+                    "vector": batch["embedding"][i],
+                }
+                for i in range(len(batch["_id"]))
+            ]
+            db.insert_batch(
+                collection=db.judgement_chunks_collection,
+                objects=objects,
+            )
+
+
+if __name__ == "__main__":
+    typer.run(main)
diff --git a/scripts/embed/weaviate_example.py b/scripts/embed/weaviate_example.py
new file mode 100644
index 0000000..03dcb56
--- /dev/null
+++ b/scripts/embed/weaviate_example.py
@@ -0,0 +1,38 @@
+import os
+from pprint import pprint
+
+from dotenv import load_dotenv
+
+import weaviate
+from weaviate.collections.classes.grpc import MetadataQuery
+
+load_dotenv()
+WV_HOST = os.getenv("WV_HOST", "localhost")
+WV_PORT = int(os.getenv("WV_PORT", 8080))
+WV_GRPC_PORT = int(os.getenv("WV_GRPC_PORT", 50051))
+WV_API_KEY = os.getenv("WV_API_KEY", None)
+
+QUERY_PROMPT = "zapytanie: {query}"
+
+# NOTE: This is a standalone example; for convenience you can use juddges/data/weaviate_db.py
+with weaviate.connect_to_local(
+    host=WV_HOST,
+    port=WV_PORT,
+    grpc_port=WV_GRPC_PORT,
+    auth_credentials=weaviate.auth.Auth.api_key(WV_API_KEY),
+) as client:
+    coll = client.collections.get("judgement_chunks")
+    response = coll.query.hybrid(
+        query=QUERY_PROMPT.format(query="oskarżony handlował narkotykami"),
+        limit=2,
+        return_metadata=MetadataQuery(distance=True),
+    )
+
+for o in response.objects:
+    print(
+        f"{o.properties['judgment_id']} - {o.properties['chunk_id']}".center(
+            100,
+            "=",
+        )
+    )
+    pprint(o.properties["chunk_text"])
diff --git a/weaviate/README.md b/weaviate/README.md
new file mode 100644
index 0000000..add67b5
--- /dev/null
+++ b/weaviate/README.md
@@ -0,0 +1,16 @@
+# Weaviate deployment
+
+## Instructions
+1. Prepare a `.env` file with your user names and API tokens
+   ```bash
+   cp example.env .env
+   ```
+2. Run the containers with Docker Compose
+   ```bash
+   docker compose up -d
+   ```
+
+## Remarks
+* Persistent data is stored in the mounted `./weaviate_data` directory
+* The deployment was tested on a machine with 16 CPUs, 64 GB of memory, and no GPU (vectors were computed outside the Weaviate instance; `t2v-transformers` is used only for inference)
+* See [scripts/embed/weaviate_example.py](../scripts/embed/weaviate_example.py) for an example search query
diff --git a/weaviate/docker-compose.yaml b/weaviate/docker-compose.yaml
new file mode 100644
index 0000000..e7c0bb8
--- /dev/null
+++ b/weaviate/docker-compose.yaml
@@ -0,0 +1,33 @@
+name: weaviate
+services:
+  weaviate:
+    command:
+    - --host
+    - 0.0.0.0
+    - --port
+    - '8080'
+    - --scheme
+    - http
+    image: cr.weaviate.io/semitechnologies/weaviate:1.26.4
+    depends_on:
+    - t2v-transformers
+    ports:
+    - 8080:8080
+    - 50051:50051
+    volumes:
+    - ./weaviate_data:/var/lib/weaviate
+    restart: on-failure:0
+    env_file:
+    - path: .env
+      required: true
+    cpu_count: 14
+    mem_limit: 60g
+
+  t2v-transformers:
+    build:
+      context: .
+      dockerfile: hf_transformers.dockerfile
+      args:
+      - MODEL_NAME=sdadas/mmlw-roberta-large
+    environment:
+      ENABLE_CUDA: 0 # Set to 1 to enable
diff --git a/weaviate/example.env b/weaviate/example.env
new file mode 100644
index 0000000..5b4fa47
--- /dev/null
+++ b/weaviate/example.env
@@ -0,0 +1,15 @@
+QUERY_DEFAULTS_LIMIT=25
+PERSISTENCE_DATA_PATH='/var/lib/weaviate'
+DEFAULT_VECTORIZER_MODULE='none'
+ENABLE_MODULES=''
+CLUSTER_HOSTNAME='juddges-vm'
+
+# vectorization
+ENABLE_MODULES='text2vec-transformers'
+TRANSFORMERS_INFERENCE_API=http://t2v-transformers:8080
+
+# authentication
+AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='false'
+AUTHENTICATION_APIKEY_ENABLED='true'
+AUTHENTICATION_APIKEY_ALLOWED_KEYS=','
+AUTHENTICATION_APIKEY_USERS=','
diff --git a/weaviate/hf_transformers.dockerfile b/weaviate/hf_transformers.dockerfile
new file mode 100644
index 0000000..5bb8916
--- /dev/null
+++ b/weaviate/hf_transformers.dockerfile
@@ -0,0 +1,3 @@
+FROM semitechnologies/transformers-inference:custom
+ARG MODEL_NAME
+RUN ./download.py
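For a quick end-to-end check after ingestion, here is a minimal query sketch (not part of the diff) that goes through the `WeaviateJudgementsDatabase` helper introduced above instead of the raw client used in `scripts/embed/weaviate_example.py`. It assumes the compose stack is running and that `WV_HOST`, `WV_PORT`, `WV_GRPC_PORT`, and `WV_API_KEY` are set in the environment; the query text is purely illustrative.

```python
import os

from dotenv import load_dotenv

from juddges.data.weaviate_db import WeaviateJudgementsDatabase

load_dotenv()

# Connect through the helper class; entering the context manager also
# creates the collections if they do not exist yet.
with WeaviateJudgementsDatabase(
    host=os.getenv("WV_HOST", "localhost"),
    port=os.getenv("WV_PORT", "8080"),
    grpc_port=os.getenv("WV_GRPC_PORT", "50051"),
    api_key=os.getenv("WV_API_KEY"),
) as db:
    # The chunk collection is configured with text2vec-transformers,
    # so the query string is vectorized server-side for the hybrid search.
    response = db.judgement_chunks_collection.query.hybrid(
        query="zapytanie: oskarżony handlował narkotykami",
        limit=2,
    )

for obj in response.objects:
    print(obj.properties["judgment_id"], obj.properties["chunk_id"])
    print(obj.properties["chunk_text"][:200])
```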