diff --git a/configs/model/Bielik-11B-v2.2-Instruct-fine-tuned.yaml b/configs/model/Bielik-11B-v2.2-Instruct-fine-tuned.yaml
new file mode 100644
index 0000000..4e29955
--- /dev/null
+++ b/configs/model/Bielik-11B-v2.2-Instruct-fine-tuned.yaml
@@ -0,0 +1,11 @@
+name: speakleash/Bielik-11B-v2.2-Instruct  # base model identifier (looks like a Hugging Face Hub repo id; TODO confirm)
+tokenizer_name: ${.name}  # interpolation of the sibling `name` key (OmegaConf-style relative reference), so the tokenizer follows the base model
+
+adapter_path: data/experiments/fine-tune/Bielik-11B-v2.2-Instruct/pl-court-instruct/checkpoint-1500  # fine-tuned adapter checkpoint directory (step 1500 judging by the name; verify)
+
+max_seq_length: 7_900  # i.e. 7900; NOTE(review): the `_` separator is an int only under YAML 1.1 loaders (e.g. PyYAML), a string under strict YAML 1.2
+batch_size: 1
+padding: longest
+use_4bit: true  # presumably enables 4-bit quantized loading; confirm against the consuming script
+
+use_unsloth: true
diff --git a/data/experiments/fine-tune/Bielik-11B-v2.2-Instruct/.gitignore b/data/experiments/fine-tune/Bielik-11B-v2.2-Instruct/.gitignore
new file mode 100644
index 0000000..c5110ed
--- /dev/null
+++ b/data/experiments/fine-tune/Bielik-11B-v2.2-Instruct/.gitignore
@@ -0,0 +1 @@
+/pl-court-instruct
diff --git a/data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/.gitignore b/data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/.gitignore
new file mode 100644
index 0000000..a29b11b
--- /dev/null
+++ b/data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/.gitignore
@@ -0,0 +1,9 @@
+/outputs_42.json
+/outputs_7312.json
+/outputs_997.json
+/metrics_997.json
+/metrics_42.json
+/metrics_7312.json
+/judge_metrics_7312.json
+/judge_metrics_42.json
+/judge_metrics_997.json
diff --git a/data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/.gitignore b/data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/.gitignore
index 3e07b1d..4280e22 100644
--- a/data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/.gitignore
+++ b/data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/.gitignore
@@ -1 +1,9 @@
 /outputs_997.json
+/outputs_42.json
+/outputs_7312.json
+/metrics_997.json
+/metrics_42.json
+/metrics_7312.json
+/judge_metrics_42.json
+/judge_metrics_7312.json
+/judge_metrics_997.json
diff --git a/data/experiments/predict/pl-court-instruct/metrics_judge_summary.md b/data/experiments/predict/pl-court-instruct/metrics_judge_summary.md
index 8afa231..bde8d3f 100644
--- a/data/experiments/predict/pl-court-instruct/metrics_judge_summary.md
+++ b/data/experiments/predict/pl-court-instruct/metrics_judge_summary.md
@@ -1,5 +1,17 @@
 | llm                                           | assessment      | court_name      | date            | department_name   | judges          | legal_bases     | recorder        | signature       |
 |:----------------------------------------------|:----------------|:----------------|:----------------|:------------------|:----------------|:----------------|:----------------|:----------------|
+| Bielik-11B-v2.2-Instruct                      | (Correct)       | 0.868 (± 0.003) | 0.914 (± 0.003) | 0.833 (± 0.003)   | 0.514 (± 0.004) | 0.024 (± 0.001) | 0.829 (± 0.001) | 0.837 (± 0.001) |
+| Bielik-11B-v2.2-Instruct                      | (Disagreement)  | 0.037 (± 0.001) | 0.023 (± 0.000) | 0.067 (± 0.002)   | 0.160 (± 0.001) | 0.599 (± 0.002) | 0.005 (± 0.001) | 0.018 (± 0.001) |
+| Bielik-11B-v2.2-Instruct                      | (Subset)        | 0.012 (± 0.001) | 0.000 (± 0.000) | 0.019 (± 0.001)   | 0.020 (± 0.000) | 0.060 (± 0.000) | 0.041 (± 0.001) | 0.004 (± 0.001) |
+| Bielik-11B-v2.2-Instruct                      | (Superset)      | 0.020 (± 0.001) | 0.000 (± 0.000) | 0.017 (± 0.001)   | 0.242 (± 0.002) | 0.154 (± 0.002) | 0.002 (± 0.001) | 0.007 (± 0.000) |
+| Bielik-11B-v2.2-Instruct                      | (empty-answer)  | 0.064 (± 0.003) | 0.064 (± 0.003) | 0.064 (± 0.003)   | 0.065 (± 0.003) | 0.163 (± 0.004) | 0.124 (± 0.001) | 0.134 (± 0.002) |
+| Bielik-11B-v2.2-Instruct                      | (non-evaluable) | 0.000 (± 0.000) | 0.000 (± 0.000) | 0.000 (± 0.000)   | 0.000 (± 0.000) | 0.000 (± 0.000) | 0.000 (± 0.000) | 0.000 (± 0.000) |
+| Bielik-11B-v2.2-Instruct-fine-tuned           | (Correct)       | 0.859 (± 0.002) | 0.847 (± 0.001) | 0.848 (± 0.001)   | 0.824 (± 0.003) | 0.066 (± 0.003) | 0.647 (± 0.011) | 0.529 (± 0.007) |
+| Bielik-11B-v2.2-Instruct-fine-tuned           | (Disagreement)  | 0.009 (± 0.000) | 0.022 (± 0.001) | 0.009 (± 0.000)   | 0.014 (± 0.001) | 0.544 (± 0.002) | 0.044 (± 0.002) | 0.059 (± 0.007) |
+| Bielik-11B-v2.2-Instruct-fine-tuned           | (Subset)        | 0.000 (± 0.000) | 0.000 (± 0.000) | 0.006 (± 0.000)   | 0.011 (± 0.001) | 0.010 (± 0.001) | 0.053 (± 0.003) | 0.038 (± 0.006) |
+| Bielik-11B-v2.2-Instruct-fine-tuned           | (Superset)      | 0.000 (± 0.000) | 0.000 (± 0.000) | 0.006 (± 0.000)   | 0.020 (± 0.001) | 0.164 (± 0.004) | 0.001 (± 0.001) | 0.001 (± 0.000) |
+| Bielik-11B-v2.2-Instruct-fine-tuned           | (empty-answer)  | 0.132 (± 0.002) | 0.132 (± 0.002) | 0.132 (± 0.002)   | 0.132 (± 0.002) | 0.217 (± 0.002) | 0.255 (± 0.012) | 0.373 (± 0.013) |
+| Bielik-11B-v2.2-Instruct-fine-tuned           | (non-evaluable) | 0.000 (± 0.000) | 0.000 (± 0.000) | 0.000 (± 0.000)   | 0.000 (± 0.000) | 0.000 (± 0.000) | 0.000 (± 0.000) | 0.000 (± 0.000) |
 | Bielik-7B-Instruct-v0.1                       | (Correct)       | 0.000 (± 0.000) | 0.001 (± 0.001) | 0.000 (± 0.000)   | 0.001 (± 0.001) | 0.000 (± 0.000) | 0.000 (± 0.000) | 0.000 (± 0.000) |
 | Bielik-7B-Instruct-v0.1                       | (Disagreement)  | 0.000 (± 0.000) | 0.001 (± 0.000) | 0.001 (± 0.000)   | 0.002 (± 0.002) | 0.002 (± 0.001) | 0.001 (± 0.001) | 0.001 (± 0.000) |
 | Bielik-7B-Instruct-v0.1                       | (Subset)        | 0.000 (± 0.000) | 0.000 (± 0.000) | 0.000 (± 0.000)   | 0.000 (± 0.000) | 0.000 (± 0.000) | 0.000 (± 0.000) | 0.000 (± 0.000) |
diff --git a/data/experiments/predict/pl-court-instruct/metrics_ngram_summary.md b/data/experiments/predict/pl-court-instruct/metrics_ngram_summary.md
index 0c4d7b1..84bcdb7 100644
--- a/data/experiments/predict/pl-court-instruct/metrics_ngram_summary.md
+++ b/data/experiments/predict/pl-court-instruct/metrics_ngram_summary.md
@@ -1,5 +1,7 @@
 | llm                                           | full_text_chrf   | court_name      | date            | department_name   | judges          | legal_bases     | recorder        | signature       |
 |:----------------------------------------------|:-----------------|:----------------|:----------------|:------------------|:----------------|:----------------|:----------------|:----------------|
+| Bielik-11B-v2.2-Instruct                      | 0.679 (± 0.001)  | 0.891 (± 0.002) | 0.921 (± 0.002) | 0.902 (± 0.003)   | 0.858 (± 0.003) | 0.472 (± 0.001) | 0.842 (± 0.001) | 0.790 (± 0.002) |
+| Bielik-11B-v2.2-Instruct-fine-tuned           | 0.749 (± 0.001)  | 0.865 (± 0.001) | 0.856 (± 0.001) | 0.864 (± 0.001)   | 0.848 (± 0.002) | 0.548 (± 0.000) | 0.695 (± 0.011) | 0.589 (± 0.010) |
 | Bielik-7B-Instruct-v0.1                       | 0.354 (± 0.001)  | 0.000 (± 0.000) | 0.001 (± 0.000) | 0.001 (± 0.000)   | 0.001 (± 0.000) | 0.001 (± 0.000) | 0.000 (± 0.000) | 0.000 (± 0.000) |
 | Bielik-7B-Instruct-v0.1-fine-tuned            | 0.717 (± 0.000)  | 0.890 (± 0.007) | 0.863 (± 0.007) | 0.886 (± 0.007)   | 0.879 (± 0.007) | 0.465 (± 0.004) | 0.639 (± 0.001) | 0.459 (± 0.002) |
 | Unsloth-Llama-3-8B-Instruct                   | 0.579 (± 0.001)  | 0.863 (± 0.002) | 0.946 (± 0.002) | 0.909 (± 0.002)   | 0.912 (± 0.003) | 0.362 (± 0.002) | 0.735 (± 0.004) | 0.686 (± 0.004) |
diff --git a/dvc.lock b/dvc.lock
index ad89cf6..7c5fb51 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -4,5417 +4,3026 @@ stages:
     cmd: PYTHONPATH=. python scripts/embed/aggregate_embeddings.py  --embeddings-dir
       data/embeddings/pl-court-raw/mmlw-roberta-large/all_embeddings
     deps:
-    - path: data/embeddings/pl-court-raw/mmlw-roberta-large/all_embeddings
-      hash: md5
+    - hash: md5
       md5: 1a086db46b90b0f3c4c66c3ecefe8adb.dir
-      size: 24415235644
       nfiles: 53
-    - path: scripts/embed/aggregate_embeddings.py
-      hash: md5
+      path: data/embeddings/pl-court-raw/mmlw-roberta-large/all_embeddings
+      size: 24415235644
+    - hash: md5
       md5: edb817e03c0c1c20822eda0e445f5083
+      path: scripts/embed/aggregate_embeddings.py
       size: 1839
     outs:
-    - path: data/embeddings/pl-court-raw/mmlw-roberta-large/agg_embeddings.pt
-      hash: md5
+    - hash: md5
       md5: 0d84b4da5513feeb6ca9bad70a2ff164
+      path: data/embeddings/pl-court-raw/mmlw-roberta-large/agg_embeddings.pt
       size: 1725566207
   build_graph_dataset:
     cmd: PYTHONPATH=. python scripts/dataset/build_graph_dataset.py --dataset-dir
       data/datasets/pl/raw  --embeddings-root-dir data/embeddings/pl-court-raw/mmlw-roberta-large/
       --target-dir data/datasets/pl/graph
     deps:
-    - path: data/datasets/pl/raw
-      hash: md5
+    - hash: md5
       md5: 5dd44be2eea852bcce3d0918ff8b97da.dir
-      size: 10234880729
       nfiles: 17
-    - path: data/embeddings/pl-court-raw/mmlw-roberta-large/agg_embeddings.pt
-      hash: md5
+      path: data/datasets/pl/raw
+      size: 10234880729
+    - hash: md5
       md5: 0d84b4da5513feeb6ca9bad70a2ff164
+      path: data/embeddings/pl-court-raw/mmlw-roberta-large/agg_embeddings.pt
       size: 1725566207
-    - path: data/embeddings/pl-court-raw/mmlw-roberta-large/all_embeddings/config.yaml
-      hash: md5
+    - hash: md5
       md5: fbb5585b8c3ef28255801d38c9248f8e
+      path: data/embeddings/pl-court-raw/mmlw-roberta-large/all_embeddings/config.yaml
       size: 502
-    - path: juddges/data/pl_court_graph.py
-      hash: md5
+    - hash: md5
       md5: 730e3d92be26408bd6dc26606b4c22ff
+      path: juddges/data/pl_court_graph.py
       size: 4974
-    - path: scripts/dataset/build_graph_dataset.py
-      hash: md5
+    - hash: md5
       md5: e7f76dc4f24d884291e1f0b66d8244a8
+      path: scripts/dataset/build_graph_dataset.py
       size: 1159
     outs:
-    - path: data/datasets/pl/graph/data
-      hash: md5
+    - hash: md5
       md5: f2820796cff4578c11ffcb0fa6cdadd7.dir
-      size: 1823760294
       nfiles: 2
-    - path: data/datasets/pl/graph/metadata.yaml
-      hash: md5
+      path: data/datasets/pl/graph/data
+      size: 1823760294
+    - hash: md5
       md5: 68b09dd0ce741e6ee1fff4e37c954fa6
+      path: data/datasets/pl/graph/metadata.yaml
       size: 564
-  build_instruct_dataset:
-    cmd: PYTHONPATH=. python scripts/dataset/build_instruct_dataset.py --dataset-dir
-      data/datasets/pl/raw --repo-id JuDDGES/pl-court-instruct
+  build_instruct_dataset_en:
+    cmd: PYTHONPATH=. python scripts/dataset/build_instruct_dataset_en.py --repo-id
+      JuDDGES/en-court-instruct
     deps:
-    - path: data/datasets/pl/raw
-      hash: md5
-      md5: 5dd44be2eea852bcce3d0918ff8b97da.dir
-      size: 10234880729
-      nfiles: 17
-    - path: scripts/dataset/build_instruct_dataset.py
-      hash: md5
-      md5: 5038c49e847d847ea3fd05903624d5c9
-      size: 5696
+    - hash: md5
+      md5: 39e530fbd8c7f3a696e117ee13578e1f
+      path: scripts/dataset/build_instruct_dataset_en.py
+      size: 5203
   embed@mmlw-roberta-large:
     cmd: PYTHONPATH=. python scripts/embed/embed_text.py embedding_model=mmlw-roberta-large
     deps:
-    - path: configs/embedding.yaml
-      hash: md5
+    - hash: md5
       md5: 22fa56f7d7d5a1c1372a8a8b57b02ba8
+      path: configs/embedding.yaml
       size: 467
-    - path: configs/embedding_model/mmlw-roberta-large.yaml
-      hash: md5
+    - hash: md5
       md5: 22f36cfd196c0fdc3cfd8a036d52b606
+      path: configs/embedding_model/mmlw-roberta-large.yaml
       size: 52
-    - path: data/datasets/pl/raw
-      hash: md5
+    - hash: md5
       md5: 5dd44be2eea852bcce3d0918ff8b97da.dir
-      size: 10234880729
       nfiles: 17
-    - path: scripts/embed/embed_text.py
-      hash: md5
+      path: data/datasets/pl/raw
+      size: 10234880729
+    - hash: md5
       md5: a2953ae4974ef96d62063b5c2711e967
+      path: scripts/embed/embed_text.py
       size: 3549
     outs:
-    - path: data/embeddings/pl-court-raw/mmlw-roberta-large/all_embeddings
-      hash: md5
+    - hash: md5
       md5: 1a086db46b90b0f3c4c66c3ecefe8adb.dir
-      size: 24415235644
       nfiles: 53
-  evaluate@Unsloth-Llama-3-8B-Instruct:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py  --output-file 
-      data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json
-      hash: md5
-      md5: df2f1d464152f87737c8ebb5b0673854
-      size: 2179383
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 66211e8b6f056234240f094896966a9c
-      size: 578
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json
-      hash: md5
-      md5: 521a731cc2c45d3eda0656a8e69d505b
-      size: 307
-  evaluate@Unsloth-Llama-3-8B-Instruct-fine-tuned:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py  --output-file 
-      data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct-fine-tuned.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct-fine-tuned.json
-      hash: md5
-      md5: 9199da7e04fb35cc1ce2bbe9dd5cd274
-      size: 1891254
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 66211e8b6f056234240f094896966a9c
-      size: 578
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct-fine-tuned.json
-      hash: md5
-      md5: 6a0eb30a14687342bc86ae80253cd60c
-      size: 306
-  evaluate@Unsloth-Mistral-7B-Instruct-v0.3:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py  --output-file 
-      data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3.json
-      hash: md5
-      md5: c2e03f3fbd29c744023bdac7e1007265
-      size: 2007040
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 66211e8b6f056234240f094896966a9c
-      size: 578
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3.json
-      hash: md5
-      md5: 091b8888275600052dd2dcdd36a55588
-      size: 305
-  evaluate@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py  --output-file 
-      data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json
-      hash: md5
-      md5: a4fda5774b367e8924cf07f3bf271922
-      size: 1834778
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 66211e8b6f056234240f094896966a9c
-      size: 578
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json
-      hash: md5
-      md5: 3b3589929112cb2f199044d240e87bcc
-      size: 305
-  instruct_dataset_readme:
-    cmd: jupyter nbconvert  --no-input  --to markdown  --execute nbs/Data/03_Dataset_Description_Instruct.ipynb
-      --output-dir data/datasets/pl/readme/instruct --output README
+      path: data/embeddings/pl-court-raw/mmlw-roberta-large/all_embeddings
+      size: 24415235644
+  evaluate_api_models@en-court-instruct-open_ai_gpt-4o-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
+      --num-proc=-1
     deps:
-    - path: nbs/Data/03_Dataset_Description_Instruct.ipynb
-      hash: md5
-      md5: 27e6d517445028d45e5c40b22febece4
-      size: 16215
+    - hash: md5
+      md5: 8f70e2baa0b0ae8a320577f5c8a60011
+      path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
+      size: 679432
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: data/datasets/pl/readme/instruct/
-      hash: md5
-      md5: de02794df3d74d86f8610f040a17dcbe.dir
-      size: 144326
-      nfiles: 5
-  predict@Unsloth-Llama-3-8B-Instruct:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Llama-3-8B-Instruct
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
-      md5: 1b4c0353b8c41fd3656ec5cf15eb6c2b
-      size: 161
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 888667e56c54157be4d75f85657cf478
-      size: 494
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: 1dc3e25365c4200d1e26e04b41d6b831
-      size: 3188
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json
-      hash: md5
-      md5: df2f1d464152f87737c8ebb5b0673854
-      size: 2179383
-  predict@Unsloth-Llama-3-8B-Instruct-fine-tuned:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Llama-3-8B-Instruct-fine-tuned
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned.yaml
-      hash: md5
-      md5: dd00fc3994bdc95baf1f17de7b026a0f
-      size: 245
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 7422a2c12c7d31d7b68dbe89f02dab5a
-      size: 532
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: 150d40027312348c19a82ca4f89b4cc6
-      size: 2735
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct-fine-tuned.json
-      hash: md5
-      md5: 5c49073109ca97d16501ca74fc568df7
-      size: 1742376
-  predict@Unsloth-Mistral-7B-Instruct-v0.3:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-7B-Instruct-v0.3
-    deps:
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml
-      hash: md5
-      md5: 71dbbb0a8a2454c7c0210e2d1acd859d
-      size: 167
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 888667e56c54157be4d75f85657cf478
-      size: 494
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: 1dc3e25365c4200d1e26e04b41d6b831
-      size: 3188
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3.json
-      hash: md5
-      md5: c2e03f3fbd29c744023bdac7e1007265
-      size: 2007040
-  predict@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned
-    deps:
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml
-      hash: md5
-      md5: 8e8b380ef9bc65715cb833ce104cda20
-      size: 256
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 888667e56c54157be4d75f85657cf478
-      size: 494
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: 1dc3e25365c4200d1e26e04b41d6b831
-      size: 3188
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json
-      hash: md5
-      md5: a4fda5774b367e8924cf07f3bf271922
-      size: 1834778
-  raw_dataset_readme:
-    cmd: jupyter nbconvert --no-input --to markdown --execute 'nbs/Dataset Cards/01_Dataset_Description_Raw.ipynb'
-      --output-dir data/datasets/pl/readme/raw --output README
+    - hash: md5
+      md5: ac30bcf3c40000cab61e0914b56aba85
+      path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o/metrics_997.json
+      size: 157
+  evaluate_api_models@en-court-instruct-open_ai_gpt-4o-mini-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
+      --num-proc=-1
     deps:
-    - path: data/datasets/pl/raw
-      hash: md5
-      md5: 622ba21868561c26fb6877ad95bfb5c5.dir
-      size: 10234505621
-      nfiles: 17
-    - path: nbs/Dataset Cards/01_Dataset_Description_Raw.ipynb
-      hash: md5
-      md5: 11b39233ef419de713493cb5ec8bcfd9
-      size: 77118
+    - hash: md5
+      md5: 2a0819011b3eac56e497201a9f67e310
+      path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
+      size: 690306
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: data/datasets/pl/readme/raw/
-      hash: md5
-      md5: c82b8238e3043491c6fa49e9641e8dac.dir
-      size: 475420
-      nfiles: 8
-  sft_unsloth@Unsloth-Llama-3-8B-Instruct:
-    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py model=Unsloth-Llama-3-8B-Instruct
+    - hash: md5
+      md5: d70eb0821aff9c9e874a421b80f7f697
+      path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/metrics_997.json
+      size: 155
+  evaluate_api_models@pl-court-instruct-open_ai_gpt-4o-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
+      --num-proc=-1
     deps:
-    - path: configs/fine_tuning.yaml
-      hash: md5
-      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
-      size: 1356
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
-      md5: 56a95874b3e77e7ffec11c00330da5b6
-      size: 176
-    - path: scripts/sft/fine_tune_llm.py
-      hash: md5
-      md5: 4b77ee1ea604cae18f17ca00cdb6988b
-      size: 4578
+    - hash: md5
+      md5: 7c5833fdd1419163b286baaa3d71e084
+      path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
+      size: 1965252
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: data/experiments/fine-tune/Unsloth-Llama-3-8B-Instruct/pl-court-instruct
-      hash: md5
-      md5: d9850d30d221f257e1453a66a6c1eef3.dir
-      size: 784320233
-      nfiles: 33
-  sft_unsloth@Unsloth-Mistral-7B-Instruct-v0.3:
-    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py model=Unsloth-Mistral-7B-Instruct-v0.3
+    - hash: md5
+      md5: 65c808d4aebd8efe37b94a5128a19de6
+      path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/metrics_997.json
+      size: 306
+  evaluate_api_models@pl-court-instruct-open_ai_gpt-4o-mini-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
+      --num-proc=-1
     deps:
-    - path: configs/fine_tuning.yaml
-      hash: md5
-      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
-      size: 1356
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml
-      hash: md5
-      md5: d184e20107315876e7751bdc7c3841ad
-      size: 182
-    - path: scripts/sft/fine_tune_llm.py
-      hash: md5
-      md5: 4b77ee1ea604cae18f17ca00cdb6988b
-      size: 4578
+    - hash: md5
+      md5: 839c911f542cd7c60c9ae52ef95e9907
+      path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
+      size: 1812429
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/pl-court-instruct
-      hash: md5
-      md5: 1b47e8203c533942e1903dd816f7a7f7.dir
-      size: 1518954466
-      nfiles: 66
-  summarize_metrics@data/experiments/predict/pl-court-instruct:
-    cmd: PYTHONPATH=. python scripts/sft/summarize_metrics.py --root-dir data/experiments/predict/pl-court-instruct
+    - hash: md5
+      md5: fe43f0d25b500a0f2fb2d8199b8034fd
+      path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/metrics_997.json
+      size: 305
+  evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
+      --num-proc=-1
     deps:
-    - path: scripts/sft/summarize_metrics.py
-      hash: md5
-      md5: eb5736f5709f9773acf21bfc28c2e012
-      size: 2975
+    - hash: md5
+      md5: 761018c0a306fbee63dad2fbc119110d
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
+      size: 821683
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: data/experiments/predict/pl-court-instruct/metrics_judge_summary.md
-      hash: md5
-      md5: e5fdc8ce94d75886ec2a2b291b2b63c5
-      size: 12557
-    - path: data/experiments/predict/pl-court-instruct/metrics_ngram_summary.md
-      hash: md5
-      md5: e33a4543e486ee107e787250da19b7fa
-      size: 2853
-  evaluate_llm_as_judge@Unsloth-Llama-3-8B-Instruct-Unsloth-Llama-3-8B-Instruct:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py model=Unsloth-Llama-3-8B-Instruct
-      answers_file=data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json
-      out_metric_file=data/experiments/llm_as_judge/pl-court-instruct/judge_Unsloth-Llama-3-8B-Instruct_metrics_Unsloth-Llama-3-8B-Instruct.json
-      out_predictions_file=data/experiments/llm_as_judge/pl-court-instruct/judge_Unsloth-Llama-3-8B-Instruct_predictions_Unsloth-Llama-3-8B-Instruct.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json
-      hash: md5
-      md5: df2f1d464152f87737c8ebb5b0673854
-      size: 2179383
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 55ffa83e2778e921bdfc677889e45a23
-      size: 3676
-    outs:
-    - path: 
-        data/experiments/llm_as_judge/pl-court-instruct/judge_Unsloth-Llama-3-8B-Instruct_predictions_Unsloth-Llama-3-8B-Instruct.json
-      hash: md5
-      md5: d0be277f3585e4d71d9551cd96851183
-      size: 54800
-  predict@Unsloth-Mistral-7B-Instruct-v0.3-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-7B-Instruct-v0.3
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_997.json
-    deps:
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml
-      hash: md5
-      md5: d184e20107315876e7751bdc7c3841ad
-      size: 182
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 265776ba10a7b24b66e6bac1131e0c48
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_42.json
+      size: 149
+  evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
+      --num-proc=-1
+    deps:
+    - hash: md5
+      md5: a7361535b440251d6ce6232a15cfcdf2
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
+      size: 818877
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_997.json
-      hash: md5
-      md5: bbb883aa388b274bef3e9296df26f68f
-      size: 1795752
-  predict@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_7312.json
+    - hash: md5
+      md5: 97fa8dfaa5e57633e8fb6a7d073177f5
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_7312.json
+      size: 147
+  evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
+      --num-proc=-1
     deps:
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml
-      hash: md5
-      md5: 828382dc03dbed80cff4a3358321dc4a
-      size: 271
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 94924275d576271875fecf22c0f9b39e
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
+      size: 817490
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 3d336675e54a706fae45349adbaf6ee4
-      size: 1793461
-  predict@Unsloth-Mistral-7B-Instruct-v0.3-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-7B-Instruct-v0.3
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_42.json
+    - hash: md5
+      md5: c3552161ec68d8cc6a8e5b75f02e22e2
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_997.json
+      size: 147
+  evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json
+      --num-proc=-1
     deps:
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml
-      hash: md5
-      md5: d184e20107315876e7751bdc7c3841ad
-      size: 182
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 4246a4fafba5e130aac3db6c1c61ce30
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json
+      size: 675578
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_42.json
-      hash: md5
-      md5: 4a9d3a2bb1dd47a732bd2df8102bc93f
-      size: 1799957
-  predict@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_997.json
+    - hash: md5
+      md5: 016d1c87b2925c6f941400d178bee018
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/metrics_42.json
+      size: 157
+  evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json
+      --num-proc=-1
     deps:
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml
-      hash: md5
-      md5: 828382dc03dbed80cff4a3358321dc4a
-      size: 271
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: f0b806eebca2f3ddf49d0ff821856b45
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json
+      size: 670935
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 55d682fba1c08c68552e98be6b503b4e
-      size: 1790731
-  predict@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_42.json
+    - hash: md5
+      md5: a8459393feb773fea85ede4b831b3fa6
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/metrics_7312.json
+      size: 157
+  evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json
+      --num-proc=-1
     deps:
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml
-      hash: md5
-      md5: 828382dc03dbed80cff4a3358321dc4a
-      size: 271
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 4e968cac351ad48ad786d1ecccbbc967
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json
+      size: 670674
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 156091297490d893f9815d2ffcf17cbf
-      size: 1792160
-  predict@Unsloth-Mistral-7B-Instruct-v0.3-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-7B-Instruct-v0.3
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_7312.json
+    - hash: md5
+      md5: 21bc79aad7ab2e97b75e1d3fb18a2263
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/metrics_997.json
+      size: 157
+  evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
+      --num-proc=-1
     deps:
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml
-      hash: md5
-      md5: d184e20107315876e7751bdc7c3841ad
-      size: 182
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 4fe25ad80a20ea5d6200136176b3e4ca
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
+      size: 705218
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_7312.json
-      hash: md5
-      md5: 77e10dd2ec17e12e171e4bcab1a48e08
-      size: 1795629
-  evaluate@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_997.json
+    - hash: md5
+      md5: 0b2f663a1cbc3ef08c363ec8adc53c15
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_42.json
+      size: 151
+  evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
+      --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 55d682fba1c08c68552e98be6b503b4e
-      size: 1790731
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/metrics_997.json
-      hash: md5
-      md5: 29bf759169190a4591c2da7de5399b92
-      size: 306
-  evaluate@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_7312.json
+    - hash: md5
+      md5: cf4fdbf0e26e6c793bdca4edd6e365c0
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
+      size: 703876
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
+    outs:
+    - hash: md5
+      md5: 604b5cee14ec6520b88bafecc962e031
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_7312.json
+      size: 152
+  evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
+      --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 3d336675e54a706fae45349adbaf6ee4
-      size: 1793461
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/metrics_7312.json
-      hash: md5
-      md5: d1d4407aa0d04db49591afede0d5e71c
-      size: 307
-  evaluate@Unsloth-Mistral-7B-Instruct-v0.3-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_997.json
+    - hash: md5
+      md5: 94c30cf8fe7db71afc58a5c9cdbc0d9f
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
+      size: 705894
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
+    outs:
+    - hash: md5
+      md5: a91ec5b434bebd8ce1d2000e0a033cb9
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_997.json
+      size: 152
+  evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json
+      --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_997.json
-      hash: md5
-      md5: bbb883aa388b274bef3e9296df26f68f
-      size: 1795752
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/metrics_997.json
-      hash: md5
-      md5: 45085589a6e88e04d4e01ebf5d3e9bcc
-      size: 310
-  evaluate@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_42.json
+    - hash: md5
+      md5: 313fa5a662f37cacae4980a04830f422
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json
+      size: 642688
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
+    outs:
+    - hash: md5
+      md5: f0d37c5ac017c0e488b7c3bed01c7093
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/metrics_42.json
+      size: 156
+  evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json
+      --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 156091297490d893f9815d2ffcf17cbf
-      size: 1792160
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/metrics_42.json
-      hash: md5
-      md5: d6661078ff04e0791f7d5dae2e5ed99d
-      size: 306
-  evaluate@Unsloth-Mistral-7B-Instruct-v0.3-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_42.json
+    - hash: md5
+      md5: 4ed8db93aa14f1cc98e276d3989efa9e
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json
+      size: 642730
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
+    outs:
+    - hash: md5
+      md5: a1521ab06a56258759953bb02ae87e24
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/metrics_7312.json
+      size: 157
+  evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json
+      --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_42.json
-      hash: md5
-      md5: 4a9d3a2bb1dd47a732bd2df8102bc93f
-      size: 1799957
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/metrics_42.json
-      hash: md5
-      md5: c8b1c6a7c1c7b593d7555d38174685b7
-      size: 308
-  evaluate@Unsloth-Mistral-7B-Instruct-v0.3-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_7312.json
-      hash: md5
-      md5: 77e10dd2ec17e12e171e4bcab1a48e08
-      size: 1795629
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/metrics_7312.json
-      hash: md5
-      md5: 021edec6c9f831f8e6abe15d9771ac1e
-      size: 307
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Mistral-7B-Instruct-v0.3-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/judge_metrics_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_997.json
-      hash: md5
-      md5: bbb883aa388b274bef3e9296df26f68f
-      size: 1795752
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/judge_metrics_997.json
-      hash: md5
-      md5: a1fac753d33cf460dbfc64eeb7e1c89b
-      size: 972
-  predict@Unsloth-Llama-3-8B-Instruct-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Llama-3-8B-Instruct
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
-      md5: 56a95874b3e77e7ffec11c00330da5b6
-      size: 176
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      hash: md5
-      md5: e99c88720116c951087b6125e5f4be4d
-      size: 2008073
-  predict@Unsloth-Llama-3-8B-Instruct-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Llama-3-8B-Instruct
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
-      md5: 56a95874b3e77e7ffec11c00330da5b6
-      size: 176
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      hash: md5
-      md5: 4c25368aacb7402b1b2cae9368d187d1
-      size: 2013637
-  predict@Unsloth-Llama-3-8B-Instruct-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Llama-3-8B-Instruct-fine-tuned
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned.yaml
-      hash: md5
-      md5: b102e42e63586df07d9528d70f802b8f
-      size: 260
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 289b719e8c7166e578417e5706bdc4e3
-      size: 1760355
-  predict@Unsloth-Llama-3-8B-Instruct-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Llama-3-8B-Instruct
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
-      md5: 56a95874b3e77e7ffec11c00330da5b6
-      size: 176
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      hash: md5
-      md5: baef589507248af212aaae51602fd999
-      size: 2010150
-  predict@Unsloth-Llama-3-8B-Instruct-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Llama-3-8B-Instruct-fine-tuned
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned.yaml
-      hash: md5
-      md5: b102e42e63586df07d9528d70f802b8f
-      size: 260
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 25bee3b4ee09b36d636095b4c927a0d3
-      size: 1759194
-  predict@Unsloth-Llama-3-8B-Instruct-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Llama-3-8B-Instruct-fine-tuned
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned.yaml
-      hash: md5
-      md5: b102e42e63586df07d9528d70f802b8f
-      size: 260
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 82b2c535d99d91b9a34986375bfa31a9
-      size: 1758747
-  evaluate@Unsloth-Llama-3-8B-Instruct-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      hash: md5
-      md5: e99c88720116c951087b6125e5f4be4d
-      size: 2008073
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_42.json
-      hash: md5
-      md5: 2116481b79c785f94b35852b6e0e4f57
-      size: 304
-  evaluate@Unsloth-Llama-3-8B-Instruct-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      hash: md5
-      md5: 4c25368aacb7402b1b2cae9368d187d1
-      size: 2013637
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_7312.json
-      hash: md5
-      md5: ae6eebc7a3538e73e6ae213435c3a875
-      size: 307
-  evaluate@Unsloth-Llama-3-8B-Instruct-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      hash: md5
-      md5: baef589507248af212aaae51602fd999
-      size: 2010150
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_997.json
-      hash: md5
-      md5: b2edea153ccd1c3a0b1e22699330de4d
-      size: 306
-  evaluate@Unsloth-Llama-3-8B-Instruct-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 289b719e8c7166e578417e5706bdc4e3
-      size: 1760355
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/metrics_42.json
-      hash: md5
-      md5: c27dac02aceaecc8ead3cb49ed2ea22d
-      size: 306
-  evaluate@Unsloth-Llama-3-8B-Instruct-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 25bee3b4ee09b36d636095b4c927a0d3
-      size: 1759194
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/metrics_7312.json
-      hash: md5
-      md5: 59ec16d58c706877808d77e53429bd35
-      size: 306
-  evaluate@Unsloth-Llama-3-8B-Instruct-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 82b2c535d99d91b9a34986375bfa31a9
-      size: 1758747
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/metrics_997.json
-      hash: md5
-      md5: b0f204e659b487ff84a736792f4c0344
-      size: 302
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/judge_metrics_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 55d682fba1c08c68552e98be6b503b4e
-      size: 1790731
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/judge_metrics_997.json
-      hash: md5
-      md5: bb73f2ec119c4c16761fcb6feac1d902
-      size: 977
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Llama-3-8B-Instruct-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      hash: md5
-      md5: e99c88720116c951087b6125e5f4be4d
-      size: 2008073
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json
-      hash: md5
-      md5: 1fbffe7226913b6fdd354ef9ea980c7f
-      size: 1177
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Llama-3-8B-Instruct-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      hash: md5
-      md5: baef589507248af212aaae51602fd999
-      size: 2010150
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json
-      hash: md5
-      md5: 69d98bbed47b4f880ade8adc2987354c
-      size: 1171
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Mistral-7B-Instruct-v0.3-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/judge_metrics_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_42.json
-      hash: md5
-      md5: 4a9d3a2bb1dd47a732bd2df8102bc93f
-      size: 1799957
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/judge_metrics_42.json
-      hash: md5
-      md5: faee13bd2b0a1006140692559925b722
-      size: 995
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/judge_metrics_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 156091297490d893f9815d2ffcf17cbf
-      size: 1792160
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/judge_metrics_42.json
-      hash: md5
-      md5: 26f88b69c0756f4b2021b2855112e702
-      size: 985
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/judge_metrics_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 3d336675e54a706fae45349adbaf6ee4
-      size: 1793461
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/judge_metrics_7312.json
-      hash: md5
-      md5: ceb3859ac3ccf62a7c4210f97489ccfe
-      size: 978
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 25bee3b4ee09b36d636095b4c927a0d3
-      size: 1759194
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_7312.json
-      hash: md5
-      md5: 339ca8662be51697dd8eb2226f4cb506
-      size: 1163
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 289b719e8c7166e578417e5706bdc4e3
-      size: 1760355
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_42.json
-      hash: md5
-      md5: 1fb63c11961122a7b5cebe2882d9a59b
-      size: 1166
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 82b2c535d99d91b9a34986375bfa31a9
-      size: 1758747
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_997.json
-      hash: md5
-      md5: 08426af15d5a278bb2d5bb9a5ded9449
-      size: 1168
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Mistral-7B-Instruct-v0.3-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/judge_metrics_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_7312.json
-      hash: md5
-      md5: 77e10dd2ec17e12e171e4bcab1a48e08
-      size: 1795629
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/judge_metrics_7312.json
-      hash: md5
-      md5: f1d2dd11fd7a8f0bac451d50f21ae7f7
-      size: 995
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Llama-3-8B-Instruct-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      hash: md5
-      md5: 4c25368aacb7402b1b2cae9368d187d1
-      size: 2013637
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json
-      hash: md5
-      md5: cc503c0e3b3a10390457ff99e46463d0
-      size: 1176
-  predict@Unsloth-Mistral-Nemo-Instruct-2407-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-Nemo-Instruct-2407
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-    deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
-      md5: ca5ac52e503c9f488f98f569811c76dc
-      size: 261
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      hash: md5
-      md5: 924744efce1483e9128579cad7a4454c
-      size: 1748772
-  predict@Unsloth-Mistral-Nemo-Instruct-2407-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-Nemo-Instruct-2407
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-    deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
-      md5: ca5ac52e503c9f488f98f569811c76dc
-      size: 261
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      hash: md5
-      md5: 1385f49966e9db2a88a17f53d0887ad8
-      size: 1741944
-  predict@Unsloth-Mistral-Nemo-Instruct-2407-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-Nemo-Instruct-2407
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-    deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
-      md5: ca5ac52e503c9f488f98f569811c76dc
-      size: 261
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      hash: md5
-      md5: 4d023797a9053fd7df61f6b1796112e9
-      size: 1747404
-  evaluate@Unsloth-Mistral-Nemo-Instruct-2407-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      hash: md5
-      md5: 4d023797a9053fd7df61f6b1796112e9
-      size: 1747404
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_997.json
-      hash: md5
-      md5: 122c6d789f343d4e7232d8720b0f577a
-      size: 303
-  evaluate@Unsloth-Mistral-Nemo-Instruct-2407-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      hash: md5
-      md5: 924744efce1483e9128579cad7a4454c
-      size: 1748772
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_7312.json
-      hash: md5
-      md5: 2ea9afa67cf34dc1629a265bce2c3357
-      size: 306
-  evaluate@Unsloth-Mistral-Nemo-Instruct-2407-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      hash: md5
-      md5: 1385f49966e9db2a88a17f53d0887ad8
-      size: 1741944
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_42.json
-      hash: md5
-      md5: 37165fb96f31997589e95108f6c149a3
-      size: 306
-  predict@Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
-    deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned.yaml
-      hash: md5
-      md5: a35f354b805d105999797d19d0aa7bde
-      size: 352
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 14d4613f7d9495f5fb5f2d7b81f402a9
-      size: 1825646
-  predict@Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
-    deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned.yaml
-      hash: md5
-      md5: a35f354b805d105999797d19d0aa7bde
-      size: 352
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 41a47dc56efc29b6c2771db68bdacb17
-      size: 1822491
-  predict@Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
-    deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned.yaml
-      hash: md5
-      md5: a35f354b805d105999797d19d0aa7bde
-      size: 352
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 302e1dc4f064007e3df88ac1e8acccc5
-      size: 1831330
-  evaluate@Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 14d4613f7d9495f5fb5f2d7b81f402a9
-      size: 1825646
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/metrics_42.json
-      hash: md5
-      md5: e43e3fe605787204ba1345dedaefd124
-      size: 305
-  evaluate@Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 302e1dc4f064007e3df88ac1e8acccc5
-      size: 1831330
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/metrics_7312.json
-      hash: md5
-      md5: b5d0fa11fe5557bbe1ee9804d5d09cb1
-      size: 303
-  evaluate@Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 41a47dc56efc29b6c2771db68bdacb17
-      size: 1822491
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/metrics_997.json
-      hash: md5
-      md5: 403fc36606ced0ab31d34d3d548f948e
-      size: 303
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
-      hash: md5
-      md5: f143807267139bc29888fffb37474f08
-      size: 1733286
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_997.json
-      hash: md5
-      md5: c7aa386de02183e226bc89a99e66e738
-      size: 1168
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Mistral-Nemo-Instruct-2407-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      hash: md5
-      md5: 4d023797a9053fd7df61f6b1796112e9
-      size: 1747404
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json
-      hash: md5
-      md5: 47460b229c988d7d4131556a88f7b8de
-      size: 1168
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 76924fe96291bafebb4259eb39a14ba3
-      size: 1730656
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_42.json
-      hash: md5
-      md5: a09671885113a77aa5988f79a599f168
-      size: 1174
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 4cf7e1890a19d7dd0ccc274862afc2b7
-      size: 1718147
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_7312.json
-      hash: md5
-      md5: 053bcc64b1d98b2c8209954aa0cb492e
-      size: 1169
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Mistral-Nemo-Instruct-2407-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      hash: md5
-      md5: 1385f49966e9db2a88a17f53d0887ad8
-      size: 1741944
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json
-      hash: md5
-      md5: 59a0ded89c8d51acbb8e96d5de7ac8ab
-      size: 1169
-  evaluate_llm_as_judge@llama_3.1_8b_instruct-Unsloth-Mistral-Nemo-Instruct-2407-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=llama_3.1_8b_instruct
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      hash: md5
-      md5: 924744efce1483e9128579cad7a4454c
-      size: 1748772
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 416d51f2597f86e69bed8d510553decd
-      size: 2049
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json
-      hash: md5
-      md5: 08d3a5a84b968296decf22c50a99b656
-      size: 1164
-  sft_unsloth@Unsloth-Mistral-Nemo-Instruct-2407:
-    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py model=Unsloth-Mistral-Nemo-Instruct-2407
-    deps:
-    - path: configs/fine_tuning.yaml
-      hash: md5
-      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
-      size: 1356
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
-      md5: ca5ac52e503c9f488f98f569811c76dc
-      size: 261
-    - path: scripts/sft/fine_tune_llm.py
-      hash: md5
-      md5: 4b77ee1ea604cae18f17ca00cdb6988b
-      size: 4578
-    outs:
-    - path: data/experiments/fine-tune/Unsloth-Mistral-Nemo-Instruct-2407/pl-court-instruct
-      hash: md5
-      md5: 80bceb56982e9bdb8d4b441bf843014f.dir
-      size: 1056899473
-      nfiles: 33
-  predict@Bielik-7B-Instruct-v0.1-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Bielik-7B-Instruct-v0.1
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
-    deps:
-    - path: configs/model/Bielik-7B-Instruct-v0.1.yaml
-      hash: md5
-      md5: c3412525e9819b53fbad06363a07a871
-      size: 173
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
-      hash: md5
-      md5: fac04d78ad020b50f79fc7277a037e8e
-      size: 2016400
-  predict_with_api@gpt-4o-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict_with_api.py model_version=gpt-4o
-      seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
-    deps:
-    - path: configs/predict_with_api.yaml
-      hash: md5
-      md5: aff18078742a14c3d8ce2cd74e718d44
-      size: 320
-    - path: scripts/sft/predict_with_api.py
-      hash: md5
-      md5: 610d32b0036ae6eef4480c5a30f07999
-      size: 3987
-    outs:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
-      hash: md5
-      md5: 7c5833fdd1419163b286baaa3d71e084
-      size: 1965252
-  predict_with_api@gpt-4o-mini-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict_with_api.py model_version=gpt-4o-mini
-      seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-    deps:
-    - path: configs/predict_with_api.yaml
-      hash: md5
-      md5: aff18078742a14c3d8ce2cd74e718d44
-      size: 320
-    - path: scripts/sft/predict_with_api.py
-      hash: md5
-      md5: 610d32b0036ae6eef4480c5a30f07999
-      size: 3987
-    outs:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      hash: md5
-      md5: 839c911f542cd7c60c9ae52ef95e9907
-      size: 1812429
-  evaluate@open_ai_gpt-4o-mini-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-    deps:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      hash: md5
-      md5: 839c911f542cd7c60c9ae52ef95e9907
-      size: 1812429
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/metrics_997.json
-      hash: md5
-      md5: fe43f0d25b500a0f2fb2d8199b8034fd
-      size: 305
-  predict@Bielik-7B-Instruct-v0.1-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Bielik-7B-Instruct-v0.1
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
-    deps:
-    - path: configs/model/Bielik-7B-Instruct-v0.1.yaml
-      hash: md5
-      md5: c3412525e9819b53fbad06363a07a871
-      size: 173
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
-      hash: md5
-      md5: 2dc39513a04910c5d0c54380166639d9
-      size: 2029644
-  predict@Bielik-7B-Instruct-v0.1-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Bielik-7B-Instruct-v0.1
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
-    deps:
-    - path: configs/model/Bielik-7B-Instruct-v0.1.yaml
-      hash: md5
-      md5: c3412525e9819b53fbad06363a07a871
-      size: 173
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
-      hash: md5
-      md5: ae39bf31296ffe82c0f6a3e8c9ff63aa
-      size: 2014399
-  evaluate@open_ai_gpt-4o-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
-    deps:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
-      hash: md5
-      md5: 7c5833fdd1419163b286baaa3d71e084
-      size: 1965252
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/metrics_997.json
-      hash: md5
-      md5: 65c808d4aebd8efe37b94a5128a19de6
-      size: 306
-  sft_unsloth@Bielik-7B-Instruct-v0.1:
-    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py model=Bielik-7B-Instruct-v0.1
-    deps:
-    - path: configs/fine_tuning.yaml
-      hash: md5
-      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
-      size: 1356
-    - path: configs/model/Bielik-7B-Instruct-v0.1.yaml
-      hash: md5
-      md5: c3412525e9819b53fbad06363a07a871
-      size: 173
-    - path: scripts/sft/fine_tune_llm.py
-      hash: md5
-      md5: 4b77ee1ea604cae18f17ca00cdb6988b
-      size: 4578
-    outs:
-    - path: data/experiments/fine-tune/Bielik-7B-Instruct-v0.1/pl-court-instruct
-      hash: md5
-      md5: be61ab5ea1365c1bcf908952bc015ab4.dir
-      size: 2293711014
-      nfiles: 108
-  predict@Bielik-7B-Instruct-v0.1-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Bielik-7B-Instruct-v0.1-fine-tuned
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
-    deps:
-    - path: configs/model/Bielik-7B-Instruct-v0.1-fine-tuned.yaml
-      hash: md5
-      md5: 2d9590869dfe247d7c6335f3cd7dd0c2
-      size: 253
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 178eb0649617d4a698da6c9e315e84c5
-      size: 2034749
-  predict@Bielik-7B-Instruct-v0.1-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Bielik-7B-Instruct-v0.1-fine-tuned
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
-    deps:
-    - path: configs/model/Bielik-7B-Instruct-v0.1-fine-tuned.yaml
-      hash: md5
-      md5: 2d9590869dfe247d7c6335f3cd7dd0c2
-      size: 253
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 743ea22448bc73a7a991da075fca8841
-      size: 2031343
-  predict@Bielik-7B-Instruct-v0.1-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Bielik-7B-Instruct-v0.1-fine-tuned
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
-    deps:
-    - path: configs/model/Bielik-7B-Instruct-v0.1-fine-tuned.yaml
-      hash: md5
-      md5: 2d9590869dfe247d7c6335f3cd7dd0c2
-      size: 253
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 433a4b2aa7870a134277a265d099a588
-      size: 2029482
-  evaluate@Bielik-7B-Instruct-v0.1-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
-      hash: md5
-      md5: 2dc39513a04910c5d0c54380166639d9
-      size: 2029644
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/metrics_42.json
-      hash: md5
-      md5: 2cbca38fd0bbdb4df024f76506eeb26c
-      size: 307
-  evaluate@Bielik-7B-Instruct-v0.1-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
-      hash: md5
-      md5: ae39bf31296ffe82c0f6a3e8c9ff63aa
-      size: 2014399
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/metrics_7312.json
-      hash: md5
-      md5: ad13d47ca88e721be75c79c225e12ee6
-      size: 289
-  evaluate@Bielik-7B-Instruct-v0.1-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
-      hash: md5
-      md5: fac04d78ad020b50f79fc7277a037e8e
-      size: 2016400
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/metrics_997.json
-      hash: md5
-      md5: 83fb160145ef5e21b43f7c348658ea02
-      size: 327
-  evaluate@Bielik-7B-Instruct-v0.1-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 178eb0649617d4a698da6c9e315e84c5
-      size: 2034749
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/metrics_42.json
-      hash: md5
-      md5: 99e684c720ca4c4ef6c4276e7d1880ab
-      size: 305
-  evaluate@Bielik-7B-Instruct-v0.1-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 743ea22448bc73a7a991da075fca8841
-      size: 2031343
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/metrics_7312.json
-      hash: md5
-      md5: 07d798079cedf3dc194242d6a1bc3bcd
-      size: 306
-  evaluate@Bielik-7B-Instruct-v0.1-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 433a4b2aa7870a134277a265d099a588
-      size: 2029482
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/metrics_997.json
-      hash: md5
-      md5: adb7c1e239396bbf6e308f3f1b436099
-      size: 307
-  build_instruct_dataset_en:
-    cmd: PYTHONPATH=. python scripts/dataset/build_instruct_dataset_en.py --repo-id
-      JuDDGES/en-court-instruct
-    deps:
-    - path: scripts/dataset/build_instruct_dataset_en.py
-      hash: md5
-      md5: 39e530fbd8c7f3a696e117ee13578e1f
-      size: 5203
-  predict_with_api@en-court-instruct-gpt-4o-mini-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict_with_api.py dataset=en-court-instruct
-      model_version=gpt-4o-mini seed=997 
-      output_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-    deps:
-    - path: configs/predict_with_api.yaml
-      hash: md5
-      md5: aff18078742a14c3d8ce2cd74e718d44
-      size: 320
-    - path: scripts/sft/predict_with_api.py
-      hash: md5
-      md5: 142508c7b6df391083b0e81a3a6c4795
-      size: 3968
-    outs:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      hash: md5
-      md5: 2a0819011b3eac56e497201a9f67e310
-      size: 690306
-  evaluate@en-court-instruct-open_ai_gpt-4o-mini-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-    deps:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      hash: md5
-      md5: 2a0819011b3eac56e497201a9f67e310
-      size: 690306
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/metrics_997.json
-      hash: md5
-      md5: d70eb0821aff9c9e874a421b80f7f697
-      size: 155
-  predict@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Llama-3-8B-Instruct
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
-      md5: 56a95874b3e77e7ffec11c00330da5b6
-      size: 176
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      hash: md5
-      md5: e99c88720116c951087b6125e5f4be4d
-      size: 2008073
-  predict@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Llama-3-8B-Instruct
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
-      md5: 56a95874b3e77e7ffec11c00330da5b6
-      size: 176
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      hash: md5
-      md5: 4c25368aacb7402b1b2cae9368d187d1
-      size: 2013637
-  predict@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Llama-3-8B-Instruct
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
-      md5: 56a95874b3e77e7ffec11c00330da5b6
-      size: 176
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      hash: md5
-      md5: baef589507248af212aaae51602fd999
-      size: 2010150
-  predict@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Llama-3-8B-Instruct-fine-tuned
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned.yaml
-      hash: md5
-      md5: 3906c39a5c516f89ddafb7eff21615cd
-      size: 275
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 289b719e8c7166e578417e5706bdc4e3
-      size: 1760355
-  predict@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Llama-3-8B-Instruct-fine-tuned
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned.yaml
-      hash: md5
-      md5: 3906c39a5c516f89ddafb7eff21615cd
-      size: 275
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 25bee3b4ee09b36d636095b4c927a0d3
-      size: 1759194
-  predict@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Llama-3-8B-Instruct-fine-tuned
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned.yaml
-      hash: md5
-      md5: 3906c39a5c516f89ddafb7eff21615cd
-      size: 275
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 82b2c535d99d91b9a34986375bfa31a9
-      size: 1758747
-  predict@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-7B-Instruct-v0.3
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_42.json
-    deps:
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml
-      hash: md5
-      md5: d184e20107315876e7751bdc7c3841ad
-      size: 182
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_42.json
-      hash: md5
-      md5: 4a9d3a2bb1dd47a732bd2df8102bc93f
-      size: 1799957
-  predict@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-7B-Instruct-v0.3
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_7312.json
-    deps:
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml
-      hash: md5
-      md5: d184e20107315876e7751bdc7c3841ad
-      size: 182
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_7312.json
-      hash: md5
-      md5: 77e10dd2ec17e12e171e4bcab1a48e08
-      size: 1795629
-  predict@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-7B-Instruct-v0.3
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_997.json
-    deps:
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml
-      hash: md5
-      md5: d184e20107315876e7751bdc7c3841ad
-      size: 182
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_997.json
-      hash: md5
-      md5: bbb883aa388b274bef3e9296df26f68f
-      size: 1795752
-  predict@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_42.json
-    deps:
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml
-      hash: md5
-      md5: 828382dc03dbed80cff4a3358321dc4a
-      size: 271
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 787c129090aa1b64e337b236a4391402
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json
+      size: 642477
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 156091297490d893f9815d2ffcf17cbf
-      size: 1792160
-  predict@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_7312.json
+    - hash: md5
+      md5: f3339245ea358de4b1348c8393153946
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/metrics_997.json
+      size: 157
+  evaluate_llm_as_judge_api_models@en-gpt_4o_mini-open_ai_gpt-4o-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
+      out_metric_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o/judge_metrics_997.json
+      prompt=en
     deps:
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml
-      hash: md5
-      md5: 828382dc03dbed80cff4a3358321dc4a
-      size: 271
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 8f70e2baa0b0ae8a320577f5c8a60011
+      path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
+      size: 679432
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 3d336675e54a706fae45349adbaf6ee4
-      size: 1793461
-  predict@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_997.json
+    - hash: md5
+      md5: 1ad8736bed0fff4e88a9c32775f370bf
+      path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o/judge_metrics_997.json
+      size: 481
+  evaluate_llm_as_judge_api_models@en-gpt_4o_mini-open_ai_gpt-4o-mini-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
+      out_metric_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json
+      prompt=en
     deps:
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml
-      hash: md5
-      md5: 828382dc03dbed80cff4a3358321dc4a
-      size: 271
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 2a0819011b3eac56e497201a9f67e310
+      path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
+      size: 690306
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 55d682fba1c08c68552e98be6b503b4e
-      size: 1790731
-  predict@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
+    - hash: md5
+      md5: bd272bea099716c0c2e689a2d19c0071
+      path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json
+      size: 488
+  evaluate_llm_as_judge_api_models@pl-gpt_4o_mini-open_ai_gpt-4o-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/judge_metrics_997.json
+      prompt=pl
     deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
-      md5: ca5ac52e503c9f488f98f569811c76dc
-      size: 261
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 7c5833fdd1419163b286baaa3d71e084
+      path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
+      size: 1965252
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      hash: md5
-      md5: 1385f49966e9db2a88a17f53d0887ad8
-      size: 1741944
-  predict@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
+    - hash: md5
+      md5: 867f10aeb55a3bd46b08c8a75c3bfc60
+      path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/judge_metrics_997.json
+      size: 1176
+  evaluate_llm_as_judge_api_models@pl-gpt_4o_mini-open_ai_gpt-4o-mini-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json
+      prompt=pl
     deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
-      md5: ca5ac52e503c9f488f98f569811c76dc
-      size: 261
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 839c911f542cd7c60c9ae52ef95e9907
+      path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
+      size: 1812429
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      hash: md5
-      md5: 924744efce1483e9128579cad7a4454c
-      size: 1748772
-  predict@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
+    - hash: md5
+      md5: 24037233e5abe74fe13f69dd4fc5e26a
+      path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json
+      size: 1173
+  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
+      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json
+      prompt=en
     deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
-      md5: ca5ac52e503c9f488f98f569811c76dc
-      size: 261
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 761018c0a306fbee63dad2fbc119110d
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
+      size: 821683
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      hash: md5
-      md5: 4d023797a9053fd7df61f6b1796112e9
-      size: 1747404
-  predict@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
+    - hash: md5
+      md5: 77ecbff8c82afbfd6fec098fb87e1218
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json
+      size: 478
+  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
+      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json
+      prompt=en
     deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned.yaml
-      hash: md5
-      md5: 1d9e6407d121214f949d56ca5c3425f5
-      size: 367
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: a7361535b440251d6ce6232a15cfcdf2
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
+      size: 818877
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 14d4613f7d9495f5fb5f2d7b81f402a9
-      size: 1825646
-  predict@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
+    - hash: md5
+      md5: f25c9ad98ef817e976def98d6b7d3b5d
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json
+      size: 482
+  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
+      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json
+      prompt=en
     deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned.yaml
-      hash: md5
-      md5: 1d9e6407d121214f949d56ca5c3425f5
-      size: 367
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 94924275d576271875fecf22c0f9b39e
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
+      size: 817490
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 302e1dc4f064007e3df88ac1e8acccc5
-      size: 1831330
-  predict@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
+    - hash: md5
+      md5: 4395c32931d25a1bd9aa092c5a0e5460
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json
+      size: 478
+  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json
+      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_42.json
+      prompt=en
     deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned.yaml
-      hash: md5
-      md5: 1d9e6407d121214f949d56ca5c3425f5
-      size: 367
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 4246a4fafba5e130aac3db6c1c61ce30
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json
+      size: 675578
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 41a47dc56efc29b6c2771db68bdacb17
-      size: 1822491
-  predict@pl-court-instruct-Bielik-7B-Instruct-v0.1-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-7B-Instruct-v0.1
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
+    - hash: md5
+      md5: 5f2cea81c873a3b85ef95ba9a6dc90a5
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_42.json
+      size: 487
+  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json
+      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_7312.json
+      prompt=en
     deps:
-    - path: configs/model/Bielik-7B-Instruct-v0.1.yaml
-      hash: md5
-      md5: c3412525e9819b53fbad06363a07a871
-      size: 173
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: f0b806eebca2f3ddf49d0ff821856b45
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json
+      size: 670935
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
-      hash: md5
-      md5: 2dc39513a04910c5d0c54380166639d9
-      size: 2029644
-  predict@pl-court-instruct-Bielik-7B-Instruct-v0.1-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-7B-Instruct-v0.1
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
+    - hash: md5
+      md5: 5cc45cac8a7607e42a8a394593d33396
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_7312.json
+      size: 486
+  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json
+      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_997.json
+      prompt=en
     deps:
-    - path: configs/model/Bielik-7B-Instruct-v0.1.yaml
-      hash: md5
-      md5: c3412525e9819b53fbad06363a07a871
-      size: 173
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 4e968cac351ad48ad786d1ecccbbc967
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json
+      size: 670674
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
-      hash: md5
-      md5: ae39bf31296ffe82c0f6a3e8c9ff63aa
-      size: 2014399
-  predict@pl-court-instruct-Bielik-7B-Instruct-v0.1-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-7B-Instruct-v0.1
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
+    - hash: md5
+      md5: 90c2b0cd132130d0b9d3a60bf6fdd69b
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_997.json
+      size: 486
+  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
+      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json
+      prompt=en
     deps:
-    - path: configs/model/Bielik-7B-Instruct-v0.1.yaml
-      hash: md5
-      md5: c3412525e9819b53fbad06363a07a871
-      size: 173
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 4fe25ad80a20ea5d6200136176b3e4ca
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
+      size: 705218
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
-      hash: md5
-      md5: fac04d78ad020b50f79fc7277a037e8e
-      size: 2016400
-  predict@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-7B-Instruct-v0.1-fine-tuned
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
+    - hash: md5
+      md5: 69901f631da4ffefd09e7cbfac39cd89
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json
+      size: 480
+  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
+      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json
+      prompt=en
     deps:
-    - path: configs/model/Bielik-7B-Instruct-v0.1-fine-tuned.yaml
-      hash: md5
-      md5: 2d9590869dfe247d7c6335f3cd7dd0c2
-      size: 253
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: cf4fdbf0e26e6c793bdca4edd6e365c0
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
+      size: 703876
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 178eb0649617d4a698da6c9e315e84c5
-      size: 2034749
-  predict@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-7B-Instruct-v0.1-fine-tuned
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
+    - hash: md5
+      md5: 860b5c00ace1f2967db9b5a977cfc3ad
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json
+      size: 478
+  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
+      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json
+      prompt=en
     deps:
-    - path: configs/model/Bielik-7B-Instruct-v0.1-fine-tuned.yaml
-      hash: md5
-      md5: 2d9590869dfe247d7c6335f3cd7dd0c2
-      size: 253
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 94c30cf8fe7db71afc58a5c9cdbc0d9f
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
+      size: 705894
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 743ea22448bc73a7a991da075fca8841
-      size: 2031343
-  predict@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-7B-Instruct-v0.1-fine-tuned
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
+    - hash: md5
+      md5: 860b5c00ace1f2967db9b5a977cfc3ad
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json
+      size: 478
+  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json
+      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_42.json
+      prompt=en
     deps:
-    - path: configs/model/Bielik-7B-Instruct-v0.1-fine-tuned.yaml
-      hash: md5
-      md5: 2d9590869dfe247d7c6335f3cd7dd0c2
-      size: 253
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 313fa5a662f37cacae4980a04830f422
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json
+      size: 642688
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 433a4b2aa7870a134277a265d099a588
-      size: 2029482
-  sft_unsloth@pl-court-instruct-Unsloth-Llama-3-8B-Instruct:
-    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py dataset=pl-court-instruct
-      model=Unsloth-Llama-3-8B-Instruct
+    - hash: md5
+      md5: 974e972a09d844a77840029d642e8077
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_42.json
+      size: 486
+  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json
+      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_7312.json
+      prompt=en
     deps:
-    - path: configs/fine_tuning.yaml
-      hash: md5
-      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
-      size: 1356
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
-      md5: 56a95874b3e77e7ffec11c00330da5b6
-      size: 176
-    - path: scripts/sft/fine_tune_llm.py
-      hash: md5
-      md5: 4b77ee1ea604cae18f17ca00cdb6988b
-      size: 4578
+    - hash: md5
+      md5: 4ed8db93aa14f1cc98e276d3989efa9e
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json
+      size: 642730
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: data/experiments/fine-tune/Unsloth-Llama-3-8B-Instruct/pl-court-instruct/
-      hash: md5
-      md5: d9850d30d221f257e1453a66a6c1eef3.dir
-      size: 784320233
-      nfiles: 33
-  sft_unsloth@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3:
-    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py dataset=pl-court-instruct
-      model=Unsloth-Mistral-7B-Instruct-v0.3
+    - hash: md5
+      md5: 8a9712eb10a8da99d86bab8968fd3207
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_7312.json
+      size: 485
+  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json
+      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_997.json
+      prompt=en
     deps:
-    - path: configs/fine_tuning.yaml
-      hash: md5
-      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
-      size: 1356
-    - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml
-      hash: md5
-      md5: d184e20107315876e7751bdc7c3841ad
-      size: 182
-    - path: scripts/sft/fine_tune_llm.py
-      hash: md5
-      md5: 4b77ee1ea604cae18f17ca00cdb6988b
-      size: 4578
+    - hash: md5
+      md5: 787c129090aa1b64e337b236a4391402
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json
+      size: 642477
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/pl-court-instruct/
-      hash: md5
-      md5: 1b47e8203c533942e1903dd816f7a7f7.dir
-      size: 1518954466
-      nfiles: 66
-  sft_unsloth@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407:
-    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py dataset=pl-court-instruct
-      model=Unsloth-Mistral-Nemo-Instruct-2407
+    - hash: md5
+      md5: 34de8eabaebe6a96b4b664b664f222e2
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_997.json
+      size: 484
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-11B-v2.2-Instruct-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_42.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/judge_metrics_42.json
+      prompt=pl
     deps:
-    - path: configs/fine_tuning.yaml
-      hash: md5
-      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
-      size: 1356
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
-      md5: ca5ac52e503c9f488f98f569811c76dc
-      size: 261
-    - path: scripts/sft/fine_tune_llm.py
-      hash: md5
-      md5: 4b77ee1ea604cae18f17ca00cdb6988b
-      size: 4578
+    - hash: md5
+      md5: c3e404c898e3e193ac3aa910187b4f9f
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_42.json
+      size: 1734129
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: data/experiments/fine-tune/Unsloth-Mistral-Nemo-Instruct-2407/pl-court-instruct/
-      hash: md5
-      md5: 80bceb56982e9bdb8d4b441bf843014f.dir
-      size: 1056899473
-      nfiles: 33
-  sft_unsloth@pl-court-instruct-Bielik-7B-Instruct-v0.1:
-    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py dataset=pl-court-instruct
-      model=Bielik-7B-Instruct-v0.1
+    - hash: md5
+      md5: 198f24599357bc230bf9f1e39a235a44
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/judge_metrics_42.json
+      size: 1172
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-11B-v2.2-Instruct-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_7312.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/judge_metrics_7312.json
+      prompt=pl
     deps:
-    - path: configs/fine_tuning.yaml
-      hash: md5
-      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
-      size: 1356
-    - path: configs/model/Bielik-7B-Instruct-v0.1.yaml
-      hash: md5
-      md5: c3412525e9819b53fbad06363a07a871
-      size: 173
-    - path: scripts/sft/fine_tune_llm.py
-      hash: md5
-      md5: 4b77ee1ea604cae18f17ca00cdb6988b
-      size: 4578
+    - hash: md5
+      md5: d4a2ab2393a58f0d7e1897859eccb626
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_7312.json
+      size: 1734772
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: data/experiments/fine-tune/Bielik-7B-Instruct-v0.1/pl-court-instruct/
-      hash: md5
-      md5: be61ab5ea1365c1bcf908952bc015ab4.dir
-      size: 2293711014
-      nfiles: 108
-  predict@en-court-instruct-Unsloth-Llama-3-8B-Instruct-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Llama-3-8B-Instruct
-      random_seed=7312 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
+    - hash: md5
+      md5: 81cfdaa675ef2118cf923e57cc54d201
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/judge_metrics_7312.json
+      size: 1161
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-11B-v2.2-Instruct-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_997.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/judge_metrics_997.json
+      prompt=pl
     deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
-      md5: 56a95874b3e77e7ffec11c00330da5b6
-      size: 176
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 8f4f6bc97e33b3b2728bebb7620a4968
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_997.json
+      size: 1731689
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      hash: md5
-      md5: a7361535b440251d6ce6232a15cfcdf2
-      size: 818877
-  predict@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407
-      random_seed=997 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
+    - hash: md5
+      md5: c5861ffaa439ba9bbd95b954d6ab1f3d
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/judge_metrics_997.json
+      size: 1168
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-11B-v2.2-Instruct-fine-tuned-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_42.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/judge_metrics_42.json
+      prompt=pl
     deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
-      md5: ca5ac52e503c9f488f98f569811c76dc
-      size: 261
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: dfd5d7389b312686428cc967aea5a5b9
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_42.json
+      size: 1860743
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      hash: md5
-      md5: 94c30cf8fe7db71afc58a5c9cdbc0d9f
-      size: 705894
-  predict@en-court-instruct-Unsloth-Llama-3-8B-Instruct-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Llama-3-8B-Instruct
-      random_seed=997 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
+    - hash: md5
+      md5: abcd5722e84ec3e81ff8cf28b8a887cb
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/judge_metrics_42.json
+      size: 1165
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-11B-v2.2-Instruct-fine-tuned-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_7312.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/judge_metrics_7312.json
+      prompt=pl
     deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
-      md5: 56a95874b3e77e7ffec11c00330da5b6
-      size: 176
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 8fa2faeda5a577c06cd6bf35b8702330
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_7312.json
+      size: 1857569
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      hash: md5
-      md5: 94924275d576271875fecf22c0f9b39e
-      size: 817490
-  sft_unsloth@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407:
-    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py dataset=en-court-instruct
-      model=Unsloth-Mistral-Nemo-Instruct-2407
+    - hash: md5
+      md5: 4b77a3d10cd6027e7e141ba80e9678c2
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/judge_metrics_7312.json
+      size: 1160
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-11B-v2.2-Instruct-fine-tuned-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_997.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/judge_metrics_997.json
+      prompt=pl
     deps:
-    - path: configs/fine_tuning.yaml
-      hash: md5
-      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
-      size: 1356
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
-      md5: ca5ac52e503c9f488f98f569811c76dc
-      size: 261
-    - path: scripts/sft/fine_tune_llm.py
-      hash: md5
-      md5: 4b77ee1ea604cae18f17ca00cdb6988b
-      size: 4578
+    - hash: md5
+      md5: ba53d76f701eddb60a182de49d992878
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_997.json
+      size: 1857855
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: data/experiments/fine-tune/Unsloth-Mistral-Nemo-Instruct-2407/en-court-instruct/
-      hash: md5
-      md5: 4c4f973ee0648610fc4b696059fae47a.dir
-      size: 475726484
-      nfiles: 18
-  predict@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407
-      random_seed=7312 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
+    - hash: md5
+      md5: 9e60a1ed6002a0349656c0bd23bc7b1c
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/judge_metrics_997.json
+      size: 1164
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_42.json
+      prompt=pl
     deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
-      md5: ca5ac52e503c9f488f98f569811c76dc
-      size: 261
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 2dc39513a04910c5d0c54380166639d9
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
+      size: 2029644
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      hash: md5
-      md5: cf4fdbf0e26e6c793bdca4edd6e365c0
-      size: 703876
-  predict@en-court-instruct-Unsloth-Llama-3-8B-Instruct-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Llama-3-8B-Instruct
-      random_seed=42 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
+    - hash: md5
+      md5: 243da4df07c6dfb5199b925e3f5c07aa
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_42.json
+      size: 1137
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_7312.json
+      prompt=pl
     deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
-      md5: 56a95874b3e77e7ffec11c00330da5b6
-      size: 176
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: ae39bf31296ffe82c0f6a3e8c9ff63aa
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
+      size: 2014399
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      hash: md5
-      md5: 761018c0a306fbee63dad2fbc119110d
-      size: 821683
-  predict@pl-court-instruct-trurl-13B-academic-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=trurl-13B-academic
-      random_seed=7312 
-      output_file=data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_7312.json
+    - hash: md5
+      md5: 8098cc937d57455ca47d32c3449159a3
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_7312.json
+      size: 1129
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_997.json
+      prompt=pl
     deps:
-    - path: configs/model/trurl-13B-academic.yaml
-      hash: md5
-      md5: 3aa3ce4fc9a1958bef82b8dbfd44ab6b
-      size: 168
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: fac04d78ad020b50f79fc7277a037e8e
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
+      size: 2016400
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_7312.json
-      hash: md5
-      md5: bcd41ca4629d4cec2440a8ed2f02560f
-      size: 1283974
-  predict@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407
-      random_seed=42 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
+    - hash: md5
+      md5: f1390b2d50893a17c90fc277dc363d6a
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_997.json
+      size: 1139
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-fine-tuned-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_42.json
+      prompt=pl
     deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
-      md5: ca5ac52e503c9f488f98f569811c76dc
-      size: 261
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 178eb0649617d4a698da6c9e315e84c5
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
+      size: 2034749
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      hash: md5
-      md5: 4fe25ad80a20ea5d6200136176b3e4ca
-      size: 705218
-  predict@pl-court-instruct-trurl-13B-academic-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=trurl-13B-academic
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_997.json
+    - hash: md5
+      md5: 302b957707520fa327d1da0edf18baa3
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_42.json
+      size: 1167
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-fine-tuned-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_7312.json
+      prompt=pl
     deps:
-    - path: configs/model/trurl-13B-academic.yaml
-      hash: md5
-      md5: 3aa3ce4fc9a1958bef82b8dbfd44ab6b
-      size: 168
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 743ea22448bc73a7a991da075fca8841
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
+      size: 2031343
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_997.json
-      hash: md5
-      md5: 731cff0eb1484682de211336efeff153
-      size: 1288941
-  predict@pl-court-instruct-trurl-13B-academic-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=trurl-13B-academic
-      random_seed=42 
-      output_file=data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_42.json
+    - hash: md5
+      md5: 789f0906846251d3f0cab78d111f9c56
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_7312.json
+      size: 1163
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-fine-tuned-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_997.json
+      prompt=pl
     deps:
-    - path: configs/model/trurl-13B-academic.yaml
-      hash: md5
-      md5: 3aa3ce4fc9a1958bef82b8dbfd44ab6b
-      size: 168
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 433a4b2aa7870a134277a265d099a588
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
+      size: 2029482
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_42.json
-      hash: md5
-      md5: bb571102170940efc73f02143a530d5b
-      size: 1289839
-  evaluate@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      --num-proc=-1
+    - hash: md5
+      md5: 90f3ed04ef29c5cd29b7ec8f02a780a1
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_997.json
+      size: 1163
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json
+      prompt=pl
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      hash: md5
+    - hash: md5
       md5: e99c88720116c951087b6125e5f4be4d
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
       size: 2008073
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_42.json
-      hash: md5
-      md5: 2116481b79c785f94b35852b6e0e4f57
-      size: 304
-  evaluate@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      --num-proc=-1
+    - hash: md5
+      md5: 9d9fba0cf2169e9dd9f69579a2182b8e
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json
+      size: 1172
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json
+      prompt=pl
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      hash: md5
+    - hash: md5
       md5: 4c25368aacb7402b1b2cae9368d187d1
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
       size: 2013637
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_7312.json
-      hash: md5
-      md5: 5e851a38b322caff59de90004eb4a075
-      size: 305
-  evaluate@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      --num-proc=-1
+    - hash: md5
+      md5: e58171fc082d33c84497a13dabcf766c
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json
+      size: 1167
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json
+      prompt=pl
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      hash: md5
+    - hash: md5
       md5: baef589507248af212aaae51602fd999
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
       size: 2010150
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_997.json
-      hash: md5
-      md5: 8c3af9851700f2ff640dd9c8dc92b06d
-      size: 307
-  evaluate@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
-      --num-proc=-1
+    - hash: md5
+      md5: f8d16a5298fabe288486822779470cd8
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json
+      size: 1165
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_42.json
+      prompt=pl
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
-      hash: md5
+    - hash: md5
       md5: 289b719e8c7166e578417e5706bdc4e3
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
       size: 1760355
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/metrics_42.json
-      hash: md5
-      md5: 35fc5163dfb37097b814afcc79e91074
-      size: 304
-  evaluate@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
-      --num-proc=-1
+    - hash: md5
+      md5: 70398042d030309e7e0bc7ba927136f3
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_42.json
+      size: 1167
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_7312.json
+      prompt=pl
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
-      hash: md5
+    - hash: md5
       md5: 25bee3b4ee09b36d636095b4c927a0d3
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
       size: 1759194
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/metrics_7312.json
-      hash: md5
-      md5: f7c13c964cc9e225fa794935cbf6515c
-      size: 305
-  evaluate@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
-      --num-proc=-1
+    - hash: md5
+      md5: 9d22089c8d23bbc5a028c748e5522c23
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_7312.json
+      size: 1157
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_997.json
+      prompt=pl
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
-      hash: md5
+    - hash: md5
       md5: 82b2c535d99d91b9a34986375bfa31a9
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
       size: 1758747
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/metrics_997.json
-      hash: md5
-      md5: 68f0244a7871bae1e8bd0642a0f2c22e
-      size: 305
-  evaluate@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_42.json
-      --num-proc=-1
+    - hash: md5
+      md5: 4222d5b165de8a3a89d71d6519b71b76
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_997.json
+      size: 1170
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json
+      prompt=pl
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_42.json
-      hash: md5
-      md5: 4a9d3a2bb1dd47a732bd2df8102bc93f
-      size: 1799957
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 1385f49966e9db2a88a17f53d0887ad8
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
+      size: 1741944
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/metrics_42.json
-      hash: md5
-      md5: 703e92a1c58aca701b128fd28f4697a4
-      size: 306
-  evaluate@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_7312.json
-      --num-proc=-1
+    - hash: md5
+      md5: f4bac633a65afde9bf5612f35c3089bb
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json
+      size: 1170
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json
+      prompt=pl
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_7312.json
-      hash: md5
-      md5: 77e10dd2ec17e12e171e4bcab1a48e08
-      size: 1795629
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 924744efce1483e9128579cad7a4454c
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
+      size: 1748772
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/metrics_7312.json
-      hash: md5
-      md5: 6f3c13385fefb9e38f01a42bb210e3f5
-      size: 309
-  evaluate@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_997.json
-      --num-proc=-1
+    - hash: md5
+      md5: 1f95777ef87a547fa7a41dc597adfc39
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json
+      size: 1166
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json
+      prompt=pl
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_997.json
-      hash: md5
-      md5: bbb883aa388b274bef3e9296df26f68f
-      size: 1795752
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 4d023797a9053fd7df61f6b1796112e9
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
+      size: 1747404
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/metrics_997.json
-      hash: md5
-      md5: 533df4f640eb0699b5382cc759e0a45d
-      size: 310
-  evaluate@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_42.json
-      --num-proc=-1
+    - hash: md5
+      md5: de3f557dfdf3440262e4d8f811e526ca
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json
+      size: 1167
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_42.json
+      prompt=pl
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 156091297490d893f9815d2ffcf17cbf
-      size: 1792160
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 14d4613f7d9495f5fb5f2d7b81f402a9
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
+      size: 1825646
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/metrics_42.json
-      hash: md5
-      md5: 89338c144457d5297d844c5f9b341f9f
-      size: 307
-  evaluate@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_7312.json
-      --num-proc=-1
+    - hash: md5
+      md5: e8cff190991ee3164825dbf7eca03d12
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_42.json
+      size: 1170
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_7312.json
+      prompt=pl
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 3d336675e54a706fae45349adbaf6ee4
-      size: 1793461
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 302e1dc4f064007e3df88ac1e8acccc5
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
+      size: 1831330
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/metrics_7312.json
-      hash: md5
-      md5: 8c6ac6f31de90a1fb08d73d08a8544dc
-      size: 305
-  evaluate@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_997.json
-      --num-proc=-1
+    - hash: md5
+      md5: aee4a08e0a4d0398b34a2587c039244d
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_7312.json
+      size: 1169
+  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
+      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
+      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_997.json
+      prompt=pl
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 55d682fba1c08c68552e98be6b503b4e
-      size: 1790731
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 41a47dc56efc29b6c2771db68bdacb17
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
+      size: 1822491
+    - hash: md5
+      md5: 79a02fb864cb279f93fc4171043bb31c
+      path: scripts/sft/evaluate_llm_as_judge.py
+      size: 2253
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/metrics_997.json
-      hash: md5
-      md5: 9873757edce9412fdee1ef45513f26ac
-      size: 307
-  evaluate@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
+    - hash: md5
+      md5: aac703269b10c85d1a2b5303c22ca077
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_997.json
+      size: 1168
+  evaluate_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_42.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      hash: md5
-      md5: 1385f49966e9db2a88a17f53d0887ad8
-      size: 1741944
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: c3e404c898e3e193ac3aa910187b4f9f
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_42.json
+      size: 1734129
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_42.json
-      hash: md5
-      md5: 37165fb96f31997589e95108f6c149a3
+    - hash: md5
+      md5: a75ab0f8f8238ab8c86397dd015fd31d
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/metrics_42.json
       size: 306
-  evaluate@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
+  evaluate_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_7312.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      hash: md5
-      md5: 924744efce1483e9128579cad7a4454c
-      size: 1748772
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: d4a2ab2393a58f0d7e1897859eccb626
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_7312.json
+      size: 1734772
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_7312.json
-      hash: md5
-      md5: 2ea9afa67cf34dc1629a265bce2c3357
-      size: 306
-  evaluate@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
+    - hash: md5
+      md5: d5861dc30fca8f9bd2d311d924b3905d
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/metrics_7312.json
+      size: 305
+  evaluate_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_997.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      hash: md5
-      md5: 4d023797a9053fd7df61f6b1796112e9
-      size: 1747404
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: 8f4f6bc97e33b3b2728bebb7620a4968
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_997.json
+      size: 1731689
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_997.json
-      hash: md5
-      md5: 122c6d789f343d4e7232d8720b0f577a
-      size: 303
-  evaluate@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
+    - hash: md5
+      md5: cd6699727392af2d61383b05fa962741
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/metrics_997.json
+      size: 306
+  evaluate_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-fine-tuned-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_42.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 14d4613f7d9495f5fb5f2d7b81f402a9
-      size: 1825646
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: dfd5d7389b312686428cc967aea5a5b9
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_42.json
+      size: 1860743
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/metrics_42.json
-      hash: md5
-      md5: e43e3fe605787204ba1345dedaefd124
-      size: 305
-  evaluate@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
+    - hash: md5
+      md5: d1462bb74d1f8790270a5d97c674891c
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/metrics_42.json
+      size: 304
+  evaluate_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-fine-tuned-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_7312.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 302e1dc4f064007e3df88ac1e8acccc5
-      size: 1831330
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: 8fa2faeda5a577c06cd6bf35b8702330
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_7312.json
+      size: 1857569
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/metrics_7312.json
-      hash: md5
-      md5: b5d0fa11fe5557bbe1ee9804d5d09cb1
-      size: 303
-  evaluate@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
+    - hash: md5
+      md5: 5edacea1e40b97765c7eaa7b4991ab16
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/metrics_7312.json
+      size: 306
+  evaluate_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-fine-tuned-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_997.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 41a47dc56efc29b6c2771db68bdacb17
-      size: 1822491
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: ba53d76f701eddb60a182de49d992878
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_997.json
+      size: 1857855
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/metrics_997.json
-      hash: md5
-      md5: 403fc36606ced0ab31d34d3d548f948e
-      size: 303
-  evaluate@pl-court-instruct-Bielik-7B-Instruct-v0.1-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
+    - hash: md5
+      md5: 84fbcf83da746f9e98f70ab22be6f238
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/metrics_997.json
+      size: 304
+  evaluate_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
-      hash: md5
+    - hash: md5
       md5: 2dc39513a04910c5d0c54380166639d9
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
       size: 2029644
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/metrics_42.json
-      hash: md5
+    - hash: md5
       md5: 2cbca38fd0bbdb4df024f76506eeb26c
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/metrics_42.json
       size: 307
-  evaluate@pl-court-instruct-Bielik-7B-Instruct-v0.1-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
+  evaluate_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
-      hash: md5
+    - hash: md5
       md5: ae39bf31296ffe82c0f6a3e8c9ff63aa
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
       size: 2014399
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/metrics_7312.json
-      hash: md5
+    - hash: md5
       md5: ad13d47ca88e721be75c79c225e12ee6
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/metrics_7312.json
       size: 289
-  evaluate@pl-court-instruct-Bielik-7B-Instruct-v0.1-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
+  evaluate_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
-      hash: md5
+    - hash: md5
       md5: fac04d78ad020b50f79fc7277a037e8e
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
       size: 2016400
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/metrics_997.json
-      hash: md5
+    - hash: md5
       md5: 83fb160145ef5e21b43f7c348658ea02
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/metrics_997.json
       size: 327
-  evaluate@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
+  evaluate_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
-      hash: md5
+    - hash: md5
       md5: 178eb0649617d4a698da6c9e315e84c5
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
       size: 2034749
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/metrics_42.json
-      hash: md5
+    - hash: md5
       md5: 99e684c720ca4c4ef6c4276e7d1880ab
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/metrics_42.json
       size: 305
-  evaluate@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
+  evaluate_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
-      hash: md5
+    - hash: md5
       md5: 743ea22448bc73a7a991da075fca8841
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
       size: 2031343
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/metrics_7312.json
-      hash: md5
+    - hash: md5
       md5: 07d798079cedf3dc194242d6a1bc3bcd
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/metrics_7312.json
       size: 306
-  evaluate@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
+  evaluate_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
-      hash: md5
+    - hash: md5
       md5: 433a4b2aa7870a134277a265d099a588
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
       size: 2029482
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/metrics_997.json
-      hash: md5
+    - hash: md5
       md5: adb7c1e239396bbf6e308f3f1b436099
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/metrics_997.json
       size: 307
-  evaluate@pl-court-instruct-trurl-13B-academic-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_42.json
+  evaluate_pl@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
       --num-proc=-1
     deps:
-    - path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_42.json
-      hash: md5
-      md5: bb571102170940efc73f02143a530d5b
-      size: 1289839
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: e99c88720116c951087b6125e5f4be4d
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
+      size: 2008073
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/metrics_42.json
-      hash: md5
-      md5: 5bb8bd6918ec3d91437d3465cbbee127
-      size: 311
-  evaluate@pl-court-instruct-trurl-13B-academic-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_7312.json
+    - hash: md5
+      md5: 2116481b79c785f94b35852b6e0e4f57
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_42.json
+      size: 304
+  evaluate_pl@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
       --num-proc=-1
     deps:
-    - path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_7312.json
-      hash: md5
-      md5: bcd41ca4629d4cec2440a8ed2f02560f
-      size: 1283974
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: 4c25368aacb7402b1b2cae9368d187d1
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
+      size: 2013637
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/metrics_7312.json
-      hash: md5
-      md5: da990070981d0524aa31916fad80e0eb
-      size: 313
-  evaluate@pl-court-instruct-trurl-13B-academic-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_997.json
+    - hash: md5
+      md5: 5e851a38b322caff59de90004eb4a075
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_7312.json
+      size: 305
+  evaluate_pl@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
       --num-proc=-1
     deps:
-    - path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_997.json
-      hash: md5
-      md5: 731cff0eb1484682de211336efeff153
-      size: 1288941
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: baef589507248af212aaae51602fd999
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
+      size: 2010150
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/metrics_997.json
-      hash: md5
-      md5: 4d080092735994614eaf7125e5fe5bf2
-      size: 313
-  evaluate@en-court-instruct-Unsloth-Llama-3-8B-Instruct-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-    deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      hash: md5
-      md5: 761018c0a306fbee63dad2fbc119110d
-      size: 821683
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_42.json
-      hash: md5
-      md5: 265776ba10a7b24b66e6bac1131e0c48
-      size: 149
-  evaluate@en-court-instruct-Unsloth-Llama-3-8B-Instruct-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      hash: md5
-      md5: a7361535b440251d6ce6232a15cfcdf2
-      size: 818877
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_7312.json
-      hash: md5
-      md5: 97fa8dfaa5e57633e8fb6a7d073177f5
-      size: 147
-  evaluate@en-court-instruct-Unsloth-Llama-3-8B-Instruct-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-    deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      hash: md5
-      md5: 94924275d576271875fecf22c0f9b39e
-      size: 817490
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_997.json
-      hash: md5
-      md5: c3552161ec68d8cc6a8e5b75f02e22e2
-      size: 147
-  evaluate@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-    deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      hash: md5
-      md5: 4fe25ad80a20ea5d6200136176b3e4ca
-      size: 705218
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_42.json
-      hash: md5
-      md5: 0b2f663a1cbc3ef08c363ec8adc53c15
-      size: 151
-  evaluate@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      hash: md5
-      md5: cf4fdbf0e26e6c793bdca4edd6e365c0
-      size: 703876
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_7312.json
-      hash: md5
-      md5: 604b5cee14ec6520b88bafecc962e031
-      size: 152
-  evaluate@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-    deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      hash: md5
-      md5: 94c30cf8fe7db71afc58a5c9cdbc0d9f
-      size: 705894
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_997.json
-      hash: md5
-      md5: a91ec5b434bebd8ce1d2000e0a033cb9
-      size: 152
-  sft_unsloth@en-court-instruct-Unsloth-Llama-3-8B-Instruct:
-    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py dataset=en-court-instruct
-      model=Unsloth-Llama-3-8B-Instruct
-    deps:
-    - path: configs/fine_tuning.yaml
-      hash: md5
-      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
-      size: 1356
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
-      md5: 56a95874b3e77e7ffec11c00330da5b6
-      size: 176
-    - path: scripts/sft/fine_tune_llm.py
-      hash: md5
-      md5: 4b77ee1ea604cae18f17ca00cdb6988b
-      size: 4578
-    outs:
-    - path: data/experiments/fine-tune/Unsloth-Llama-3-8B-Instruct/en-court-instruct/
-      hash: md5
-      md5: c99c2a68274325db86fbbd41bcc30e78.dir
-      size: 354395477
-      nfiles: 18
-  predict@pl-court-instruct-qra-13b-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=qra-13b
-      random_seed=997 output_file=data/experiments/predict/pl-court-instruct/qra-13b/outputs_997.json
-    deps:
-    - path: configs/model/qra-13b.yaml
-      hash: md5
-      md5: ab2baba7b6109364d7e04c77232b0f9d
-      size: 152
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: data/experiments/predict/pl-court-instruct/qra-13b/outputs_997.json
-      hash: md5
-      md5: 72ef8a411b8f5aeb006c99e5868c754d
-      size: 2252480
-  predict_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en
-      random_seed=7312 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json
-    deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en.yaml
-      hash: md5
-      md5: 5986ff103292733bff4662585ae5d860
-      size: 351
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json
-      hash: md5
-      md5: 4ed8db93aa14f1cc98e276d3989efa9e
-      size: 642730
-  predict_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Llama-3-8B-Instruct-fine-tuned-en
-      random_seed=997 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned-en.yaml
-      hash: md5
-      md5: 67faffd57ec54260d70eb3a89d2ec130
-      size: 259
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json
-      hash: md5
-      md5: 4e968cac351ad48ad786d1ecccbbc967
-      size: 670674
-  predict_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en
-      random_seed=997 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json
-    deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en.yaml
-      hash: md5
-      md5: 5986ff103292733bff4662585ae5d860
-      size: 351
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json
-      hash: md5
-      md5: 787c129090aa1b64e337b236a4391402
-      size: 642477
-  predict@pl-court-instruct-qra-13b-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=qra-13b
-      random_seed=42 output_file=data/experiments/predict/pl-court-instruct/qra-13b/outputs_42.json
-    deps:
-    - path: configs/model/qra-13b.yaml
-      hash: md5
-      md5: ab2baba7b6109364d7e04c77232b0f9d
-      size: 152
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: data/experiments/predict/pl-court-instruct/qra-13b/outputs_42.json
-      hash: md5
-      md5: dd142d2d1c24c499bbe615bf4b74525c
-      size: 2247396
-  predict_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Llama-3-8B-Instruct-fine-tuned-en
-      random_seed=7312 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json
-    deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned-en.yaml
-      hash: md5
-      md5: 67faffd57ec54260d70eb3a89d2ec130
-      size: 259
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json
-      hash: md5
-      md5: f0b806eebca2f3ddf49d0ff821856b45
-      size: 670935
-  predict_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en
-      random_seed=42 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json
-    deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en.yaml
-      hash: md5
-      md5: 5986ff103292733bff4662585ae5d860
-      size: 351
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json
-      hash: md5
-      md5: 313fa5a662f37cacae4980a04830f422
-      size: 642688
-  predict_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-42:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Llama-3-8B-Instruct-fine-tuned-en
-      random_seed=42 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json
+    - hash: md5
+      md5: 8c3af9851700f2ff640dd9c8dc92b06d
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_997.json
+      size: 307
+  evaluate_pl@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
+      --num-proc=-1
     deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned-en.yaml
-      hash: md5
-      md5: 67faffd57ec54260d70eb3a89d2ec130
-      size: 259
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 289b719e8c7166e578417e5706bdc4e3
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
+      size: 1760355
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json
-      hash: md5
-      md5: 4246a4fafba5e130aac3db6c1c61ce30
-      size: 675578
-  predict@pl-court-instruct-qra-13b-7312:
-    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=qra-13b
-      random_seed=7312 output_file=data/experiments/predict/pl-court-instruct/qra-13b/outputs_7312.json
+    - hash: md5
+      md5: 35fc5163dfb37097b814afcc79e91074
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/metrics_42.json
+      size: 304
+  evaluate_pl@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
+      --num-proc=-1
     deps:
-    - path: configs/model/qra-13b.yaml
-      hash: md5
-      md5: ab2baba7b6109364d7e04c77232b0f9d
-      size: 152
-    - path: configs/predict.yaml
-      hash: md5
-      md5: 5fc8b9ac571d4a2209d7d866697252ab
-      size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
-      md5: f9acd63cd4d682ae2242d7b51f0d974b
-      size: 3198
+    - hash: md5
+      md5: 25bee3b4ee09b36d636095b4c927a0d3
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
+      size: 1759194
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: data/experiments/predict/pl-court-instruct/qra-13b/outputs_7312.json
-      hash: md5
-      md5: fddb307b29b598df3786fc94d479e918
-      size: 2254243
-  evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
+    - hash: md5
+      md5: f7c13c964cc9e225fa794935cbf6515c
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/metrics_7312.json
+      size: 305
+  evaluate_pl@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      hash: md5
-      md5: 761018c0a306fbee63dad2fbc119110d
-      size: 821683
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: 82b2c535d99d91b9a34986375bfa31a9
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
+      size: 1758747
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_42.json
-      hash: md5
-      md5: 265776ba10a7b24b66e6bac1131e0c48
-      size: 149
-  evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
+    - hash: md5
+      md5: 68f0244a7871bae1e8bd0642a0f2c22e
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/metrics_997.json
+      size: 305
+  evaluate_pl@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_42.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      hash: md5
-      md5: a7361535b440251d6ce6232a15cfcdf2
-      size: 818877
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: 4a9d3a2bb1dd47a732bd2df8102bc93f
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_42.json
+      size: 1799957
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_7312.json
-      hash: md5
-      md5: 97fa8dfaa5e57633e8fb6a7d073177f5
-      size: 147
-  evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
+    - hash: md5
+      md5: 703e92a1c58aca701b128fd28f4697a4
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/metrics_42.json
+      size: 306
+  evaluate_pl@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_7312.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      hash: md5
-      md5: 94924275d576271875fecf22c0f9b39e
-      size: 817490
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: 77e10dd2ec17e12e171e4bcab1a48e08
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_7312.json
+      size: 1795629
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/metrics_997.json
-      hash: md5
-      md5: c3552161ec68d8cc6a8e5b75f02e22e2
-      size: 147
-  evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json
+    - hash: md5
+      md5: 6f3c13385fefb9e38f01a42bb210e3f5
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/metrics_7312.json
+      size: 309
+  evaluate_pl@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_997.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json
-      hash: md5
-      md5: 4246a4fafba5e130aac3db6c1c61ce30
-      size: 675578
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: bbb883aa388b274bef3e9296df26f68f
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_997.json
+      size: 1795752
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/metrics_42.json
-      hash: md5
-      md5: 016d1c87b2925c6f941400d178bee018
-      size: 157
-  evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json
+    - hash: md5
+      md5: 533df4f640eb0699b5382cc759e0a45d
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/metrics_997.json
+      size: 310
+  evaluate_pl@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_42.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json
-      hash: md5
-      md5: f0b806eebca2f3ddf49d0ff821856b45
-      size: 670935
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: 156091297490d893f9815d2ffcf17cbf
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_42.json
+      size: 1792160
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/metrics_7312.json
-      hash: md5
-      md5: a8459393feb773fea85ede4b831b3fa6
-      size: 157
-  evaluate_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json
+    - hash: md5
+      md5: 89338c144457d5297d844c5f9b341f9f
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/metrics_42.json
+      size: 307
+  evaluate_pl@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_7312.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json
-      hash: md5
-      md5: 4e968cac351ad48ad786d1ecccbbc967
-      size: 670674
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: 3d336675e54a706fae45349adbaf6ee4
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_7312.json
+      size: 1793461
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/metrics_997.json
-      hash: md5
-      md5: 21bc79aad7ab2e97b75e1d3fb18a2263
-      size: 157
-  evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
+    - hash: md5
+      md5: 8c6ac6f31de90a1fb08d73d08a8544dc
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/metrics_7312.json
+      size: 305
+  evaluate_pl@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_997.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      hash: md5
-      md5: 4fe25ad80a20ea5d6200136176b3e4ca
-      size: 705218
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: 55d682fba1c08c68552e98be6b503b4e
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_997.json
+      size: 1790731
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_42.json
-      hash: md5
-      md5: 0b2f663a1cbc3ef08c363ec8adc53c15
-      size: 151
-  evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
+    - hash: md5
+      md5: 9873757edce9412fdee1ef45513f26ac
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/metrics_997.json
+      size: 307
+  evaluate_pl@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      hash: md5
-      md5: cf4fdbf0e26e6c793bdca4edd6e365c0
-      size: 703876
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: 1385f49966e9db2a88a17f53d0887ad8
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
+      size: 1741944
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_7312.json
-      hash: md5
-      md5: 604b5cee14ec6520b88bafecc962e031
-      size: 152
-  evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
+    - hash: md5
+      md5: 37165fb96f31997589e95108f6c149a3
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_42.json
+      size: 306
+  evaluate_pl@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      hash: md5
-      md5: 94c30cf8fe7db71afc58a5c9cdbc0d9f
-      size: 705894
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: 924744efce1483e9128579cad7a4454c
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
+      size: 1748772
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_997.json
-      hash: md5
-      md5: a91ec5b434bebd8ce1d2000e0a033cb9
-      size: 152
-  evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json
+    - hash: md5
+      md5: 2ea9afa67cf34dc1629a265bce2c3357
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_7312.json
+      size: 306
+  evaluate_pl@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json
-      hash: md5
-      md5: 313fa5a662f37cacae4980a04830f422
-      size: 642688
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: 4d023797a9053fd7df61f6b1796112e9
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
+      size: 1747404
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/metrics_42.json
-      hash: md5
-      md5: f0d37c5ac017c0e488b7c3bed01c7093
-      size: 156
-  evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json
+    - hash: md5
+      md5: 122c6d789f343d4e7232d8720b0f577a
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/metrics_997.json
+      size: 303
+  evaluate_pl@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json
-      hash: md5
-      md5: 4ed8db93aa14f1cc98e276d3989efa9e
-      size: 642730
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: 14d4613f7d9495f5fb5f2d7b81f402a9
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
+      size: 1825646
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/metrics_7312.json
-      hash: md5
-      md5: a1521ab06a56258759953bb02ae87e24
-      size: 157
-  evaluate_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file 
-      data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json
+    - hash: md5
+      md5: e43e3fe605787204ba1345dedaefd124
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/metrics_42.json
+      size: 305
+  evaluate_pl@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
       --num-proc=-1
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json
-      hash: md5
-      md5: 787c129090aa1b64e337b236a4391402
-      size: 642477
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
+      md5: 302e1dc4f064007e3df88ac1e8acccc5
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
+      size: 1831330
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/metrics_997.json
-      hash: md5
-      md5: f3339245ea358de4b1348c8393153946
-      size: 157
-  evaluate_en@en-court-instruct-open_ai_gpt-4o-mini-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
+    - hash: md5
+      md5: b5d0fa11fe5557bbe1ee9804d5d09cb1
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/metrics_7312.json
+      size: 303
+  evaluate_pl@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
+      --num-proc=-1
     deps:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      hash: md5
-      md5: 2a0819011b3eac56e497201a9f67e310
-      size: 690306
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
+    - hash: md5
+      md5: 41a47dc56efc29b6c2771db68bdacb17
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
+      size: 1822491
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
     outs:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/metrics_997.json
-      hash: md5
-      md5: d70eb0821aff9c9e874a421b80f7f697
-      size: 155
-  evaluate@pl-court-instruct-qra-13b-42:
+    - hash: md5
+      md5: 403fc36606ced0ab31d34d3d548f948e
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/metrics_997.json
+      size: 303
+  evaluate_pl@pl-court-instruct-qra-13b-42:
     cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/qra-13b/outputs_42.json
       --num-proc=-1
     deps:
-    - path: data/experiments/predict/pl-court-instruct/qra-13b/outputs_42.json
-      hash: md5
+    - hash: md5
       md5: dd142d2d1c24c499bbe615bf4b74525c
+      path: data/experiments/predict/pl-court-instruct/qra-13b/outputs_42.json
       size: 2247396
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: data/experiments/predict/pl-court-instruct/qra-13b/metrics_42.json
-      hash: md5
+    - hash: md5
       md5: 861d59d796c9957aba2973741fd77d65
+      path: data/experiments/predict/pl-court-instruct/qra-13b/metrics_42.json
       size: 202
-  evaluate@pl-court-instruct-qra-13b-7312:
+  evaluate_pl@pl-court-instruct-qra-13b-7312:
     cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/qra-13b/outputs_7312.json
       --num-proc=-1
     deps:
-    - path: data/experiments/predict/pl-court-instruct/qra-13b/outputs_7312.json
-      hash: md5
+    - hash: md5
       md5: fddb307b29b598df3786fc94d479e918
+      path: data/experiments/predict/pl-court-instruct/qra-13b/outputs_7312.json
       size: 2254243
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: data/experiments/predict/pl-court-instruct/qra-13b/metrics_7312.json
-      hash: md5
+    - hash: md5
       md5: 78daedab21d1748c95b7308b423e6e73
+      path: data/experiments/predict/pl-court-instruct/qra-13b/metrics_7312.json
       size: 201
-  evaluate@pl-court-instruct-qra-13b-997:
+  evaluate_pl@pl-court-instruct-qra-13b-997:
     cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/qra-13b/outputs_997.json
       --num-proc=-1
     deps:
-    - path: data/experiments/predict/pl-court-instruct/qra-13b/outputs_997.json
-      hash: md5
+    - hash: md5
       md5: 72ef8a411b8f5aeb006c99e5868c754d
+      path: data/experiments/predict/pl-court-instruct/qra-13b/outputs_997.json
       size: 2252480
-    - path: scripts/sft/evaluate.py
-      hash: md5
+    - hash: md5
       md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
       size: 697
     outs:
-    - path: data/experiments/predict/pl-court-instruct/qra-13b/metrics_997.json
-      hash: md5
+    - hash: md5
       md5: c3b7776073786447d84bd5200c39ecb9
+      path: data/experiments/predict/pl-court-instruct/qra-13b/metrics_997.json
       size: 201
+  evaluate_pl@pl-court-instruct-trurl-13B-academic-42:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_42.json
+      --num-proc=-1
+    deps:
+    - hash: md5
+      md5: bb571102170940efc73f02143a530d5b
+      path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_42.json
+      size: 1289839
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
+    outs:
+    - hash: md5
+      md5: 5bb8bd6918ec3d91437d3465cbbee127
+      path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/metrics_42.json
+      size: 311
+  evaluate_pl@pl-court-instruct-trurl-13B-academic-7312:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_7312.json
+      --num-proc=-1
+    deps:
+    - hash: md5
+      md5: bcd41ca4629d4cec2440a8ed2f02560f
+      path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_7312.json
+      size: 1283974
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
+    outs:
+    - hash: md5
+      md5: da990070981d0524aa31916fad80e0eb
+      path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/metrics_7312.json
+      size: 313
+  evaluate_pl@pl-court-instruct-trurl-13B-academic-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_997.json
+      --num-proc=-1
+    deps:
+    - hash: md5
+      md5: 731cff0eb1484682de211336efeff153
+      path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_997.json
+      size: 1288941
+    - hash: md5
+      md5: 0644efb76af2c5461185e37a07ba2c17
+      path: scripts/sft/evaluate.py
+      size: 697
+    outs:
+    - hash: md5
+      md5: 4d080092735994614eaf7125e5fe5bf2
+      path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/metrics_997.json
+      size: 313
+  instruct_dataset_readme:
+    cmd: jupyter nbconvert  --no-input  --to markdown  --execute nbs/Data/03_Dataset_Description_Instruct.ipynb
+      --output-dir data/datasets/pl/readme/instruct --output README
+    deps:
+    - hash: md5
+      md5: 27e6d517445028d45e5c40b22febece4
+      path: nbs/Data/03_Dataset_Description_Instruct.ipynb
+      size: 16215
+    outs:
+    - hash: md5
+      md5: de02794df3d74d86f8610f040a17dcbe.dir
+      nfiles: 5
+      path: data/datasets/pl/readme/instruct/
+      size: 144326
   predict_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-42:
     cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Llama-3-8B-Instruct
-      random_seed=42 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
+      random_seed=42 output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
     deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
+    - hash: md5
       md5: 56a95874b3e77e7ffec11c00330da5b6
+      path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
       size: 176
-    - path: configs/predict.yaml
-      hash: md5
+    - hash: md5
       md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
       size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
+    - hash: md5
       md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
       size: 3198
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      hash: md5
+    - hash: md5
       md5: 761018c0a306fbee63dad2fbc119110d
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
       size: 821683
   predict_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-7312:
     cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Llama-3-8B-Instruct
-      random_seed=7312 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
+      random_seed=7312 output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
     deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
+    - hash: md5
       md5: 56a95874b3e77e7ffec11c00330da5b6
+      path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
       size: 176
-    - path: configs/predict.yaml
-      hash: md5
+    - hash: md5
       md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
       size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
+    - hash: md5
       md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
       size: 3198
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      hash: md5
+    - hash: md5
       md5: a7361535b440251d6ce6232a15cfcdf2
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
       size: 818877
   predict_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-997:
     cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Llama-3-8B-Instruct
-      random_seed=997 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
+      random_seed=997 output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
     deps:
-    - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
-      hash: md5
+    - hash: md5
       md5: 56a95874b3e77e7ffec11c00330da5b6
+      path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
       size: 176
-    - path: configs/predict.yaml
-      hash: md5
+    - hash: md5
       md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
       size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
+    - hash: md5
       md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
       size: 3198
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      hash: md5
+    - hash: md5
       md5: 94924275d576271875fecf22c0f9b39e
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
       size: 817490
+  predict_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Llama-3-8B-Instruct-fine-tuned-en
+      random_seed=42 output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json
+    deps:
+    - hash: md5
+      md5: 67faffd57ec54260d70eb3a89d2ec130
+      path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned-en.yaml
+      size: 259
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
+    outs:
+    - hash: md5
+      md5: 4246a4fafba5e130aac3db6c1c61ce30
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json
+      size: 675578
+  predict_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Llama-3-8B-Instruct-fine-tuned-en
+      random_seed=7312 output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json
+    deps:
+    - hash: md5
+      md5: 67faffd57ec54260d70eb3a89d2ec130
+      path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned-en.yaml
+      size: 259
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
+    outs:
+    - hash: md5
+      md5: f0b806eebca2f3ddf49d0ff821856b45
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json
+      size: 670935
+  predict_en@en-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Llama-3-8B-Instruct-fine-tuned-en
+      random_seed=997 output_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json
+    deps:
+    - hash: md5
+      md5: 67faffd57ec54260d70eb3a89d2ec130
+      path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned-en.yaml
+      size: 259
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
+    outs:
+    - hash: md5
+      md5: 4e968cac351ad48ad786d1ecccbbc967
+      path: data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json
+      size: 670674
   predict_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-42:
     cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407
-      random_seed=42 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
+      random_seed=42 output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
     deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
+    - hash: md5
       md5: ca5ac52e503c9f488f98f569811c76dc
+      path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
       size: 261
-    - path: configs/predict.yaml
-      hash: md5
+    - hash: md5
       md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
       size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
+    - hash: md5
       md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
       size: 3198
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      hash: md5
+    - hash: md5
       md5: 4fe25ad80a20ea5d6200136176b3e4ca
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
       size: 705218
   predict_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-7312:
     cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407
-      random_seed=7312 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
+      random_seed=7312 output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
     deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
+    - hash: md5
       md5: ca5ac52e503c9f488f98f569811c76dc
+      path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
       size: 261
-    - path: configs/predict.yaml
-      hash: md5
+    - hash: md5
       md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
       size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
+    - hash: md5
       md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
       size: 3198
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      hash: md5
+    - hash: md5
       md5: cf4fdbf0e26e6c793bdca4edd6e365c0
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
       size: 703876
   predict_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-997:
     cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407
-      random_seed=997 
-      output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
+      random_seed=997 output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
     deps:
-    - path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
-      hash: md5
+    - hash: md5
       md5: ca5ac52e503c9f488f98f569811c76dc
+      path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
       size: 261
-    - path: configs/predict.yaml
-      hash: md5
+    - hash: md5
       md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
       size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
+    - hash: md5
       md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
       size: 3198
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      hash: md5
+    - hash: md5
       md5: 94c30cf8fe7db71afc58a5c9cdbc0d9f
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
       size: 705894
-  predict_with_api@pl-court-instruct-gpt-4o-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict_with_api.py dataset=pl-court-instruct
-      model_version=gpt-4o seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
-    deps:
-    - path: configs/predict_with_api.yaml
-      hash: md5
-      md5: aff18078742a14c3d8ce2cd74e718d44
-      size: 320
-    - path: scripts/sft/predict_with_api.py
-      hash: md5
-      md5: 142508c7b6df391083b0e81a3a6c4795
-      size: 3968
-    outs:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
-      hash: md5
-      md5: 7c5833fdd1419163b286baaa3d71e084
-      size: 1965252
-  predict_with_api@pl-court-instruct-gpt-4o-mini-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict_with_api.py dataset=pl-court-instruct
-      model_version=gpt-4o-mini seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-    deps:
-    - path: configs/predict_with_api.yaml
-      hash: md5
-      md5: aff18078742a14c3d8ce2cd74e718d44
-      size: 320
-    - path: scripts/sft/predict_with_api.py
-      hash: md5
-      md5: 142508c7b6df391083b0e81a3a6c4795
-      size: 3968
-    outs:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      hash: md5
-      md5: 839c911f542cd7c60c9ae52ef95e9907
-      size: 1812429
-  predict_with_api@en-court-instruct-gpt-4o-997:
-    cmd: PYTHONPATH=. python scripts/sft/predict_with_api.py dataset=en-court-instruct
-      model_version=gpt-4o seed=997 
-      output_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
+  predict_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en
+      random_seed=42 output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json
     deps:
-    - path: configs/predict_with_api.yaml
-      hash: md5
-      md5: aff18078742a14c3d8ce2cd74e718d44
-      size: 320
-    - path: scripts/sft/predict_with_api.py
-      hash: md5
-      md5: 142508c7b6df391083b0e81a3a6c4795
-      size: 3968
+    - hash: md5
+      md5: 5986ff103292733bff4662585ae5d860
+      path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en.yaml
+      size: 351
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
-      hash: md5
-      md5: 8f70e2baa0b0ae8a320577f5c8a60011
-      size: 679432
-  evaluate@pl-court-instruct-open_ai_gpt-4o-mini-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
+    - hash: md5
+      md5: 313fa5a662f37cacae4980a04830f422
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json
+      size: 642688
+  predict_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en
+      random_seed=7312 output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json
     deps:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      hash: md5
-      md5: 839c911f542cd7c60c9ae52ef95e9907
-      size: 1812429
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
+    - hash: md5
+      md5: 5986ff103292733bff4662585ae5d860
+      path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en.yaml
+      size: 351
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/metrics_997.json
-      hash: md5
-      md5: fe43f0d25b500a0f2fb2d8199b8034fd
-      size: 305
-  evaluate@pl-court-instruct-open_ai_gpt-4o-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
+    - hash: md5
+      md5: 4ed8db93aa14f1cc98e276d3989efa9e
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json
+      size: 642730
+  predict_en@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=en-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en
+      random_seed=997 output_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json
     deps:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
-      hash: md5
-      md5: 7c5833fdd1419163b286baaa3d71e084
-      size: 1965252
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
+    - hash: md5
+      md5: 5986ff103292733bff4662585ae5d860
+      path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en.yaml
+      size: 351
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/metrics_997.json
-      hash: md5
-      md5: 65c808d4aebd8efe37b94a5128a19de6
-      size: 306
-  evaluate_en@en-court-instruct-open_ai_gpt-4o-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
+    - hash: md5
+      md5: 787c129090aa1b64e337b236a4391402
+      path: data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json
+      size: 642477
+  predict_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-11B-v2.2-Instruct
+      random_seed=42 output_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_42.json
     deps:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
-      hash: md5
-      md5: 8f70e2baa0b0ae8a320577f5c8a60011
-      size: 679432
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 73aa4a7eb8a035c087702457b9401654
-      size: 636
+    - hash: md5
+      md5: 1cfb3fbe30fac3e07a30339e6bf197c9
+      path: configs/model/Bielik-11B-v2.2-Instruct.yaml
+      size: 175
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o/metrics_997.json
-      hash: md5
-      md5: ac30bcf3c40000cab61e0914b56aba85
-      size: 157
-  evaluate_llm_as_judge@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json
-      prompt=pl
+    - hash: md5
+      md5: c3e404c898e3e193ac3aa910187b4f9f
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_42.json
+      size: 1734129
+  predict_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-11B-v2.2-Instruct
+      random_seed=7312 output_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_7312.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      hash: md5
-      md5: e99c88720116c951087b6125e5f4be4d
-      size: 2008073
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 1cfb3fbe30fac3e07a30339e6bf197c9
+      path: configs/model/Bielik-11B-v2.2-Instruct.yaml
+      size: 175
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json
-      hash: md5
-      md5: 9d9fba0cf2169e9dd9f69579a2182b8e
-      size: 1172
-  predict@pl-court-instruct-Bielik-11B-v2.2-Instruct-997:
+    - hash: md5
+      md5: d4a2ab2393a58f0d7e1897859eccb626
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_7312.json
+      size: 1734772
+  predict_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-997:
     cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-11B-v2.2-Instruct
-      random_seed=997 
-      output_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_997.json
+      random_seed=997 output_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_997.json
     deps:
-    - path: configs/model/Bielik-11B-v2.2-Instruct.yaml
-      hash: md5
+    - hash: md5
       md5: 1cfb3fbe30fac3e07a30339e6bf197c9
+      path: configs/model/Bielik-11B-v2.2-Instruct.yaml
       size: 175
-    - path: configs/predict.yaml
-      hash: md5
+    - hash: md5
       md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
       size: 402
-    - path: scripts/sft/predict.py
-      hash: md5
+    - hash: md5
       md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
       size: 3198
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_997.json
-      hash: md5
+    - hash: md5
       md5: 8f4f6bc97e33b3b2728bebb7620a4968
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct/outputs_997.json
       size: 1731689
-  evaluate_llm_as_judge@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json
-      prompt=pl
+  predict_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-fine-tuned-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-11B-v2.2-Instruct-fine-tuned
+      random_seed=42 output_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_42.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      hash: md5
-      md5: 4c25368aacb7402b1b2cae9368d187d1
-      size: 2013637
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: b72e852654399c31589e5368e554cbfb
+      path: configs/model/Bielik-11B-v2.2-Instruct-fine-tuned.yaml
+      size: 256
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json
-      hash: md5
-      md5: e58171fc082d33c84497a13dabcf766c
-      size: 1167
-  evaluate_llm_as_judge@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json
-      prompt=pl
+    - hash: md5
+      md5: dfd5d7389b312686428cc967aea5a5b9
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_42.json
+      size: 1860743
+  predict_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-fine-tuned-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-11B-v2.2-Instruct-fine-tuned
+      random_seed=7312 output_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_7312.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      hash: md5
-      md5: baef589507248af212aaae51602fd999
-      size: 2010150
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: b72e852654399c31589e5368e554cbfb
+      path: configs/model/Bielik-11B-v2.2-Instruct-fine-tuned.yaml
+      size: 256
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json
-      hash: md5
-      md5: f8d16a5298fabe288486822779470cd8
-      size: 1165
-  evaluate_llm_as_judge@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 289b719e8c7166e578417e5706bdc4e3
-      size: 1760355
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_42.json
-      hash: md5
-      md5: 70398042d030309e7e0bc7ba927136f3
-      size: 1167
-  evaluate_llm_as_judge@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_7312.json
+    - hash: md5
+      md5: 8fa2faeda5a577c06cd6bf35b8702330
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_7312.json
+      size: 1857569
+  predict_pl@pl-court-instruct-Bielik-11B-v2.2-Instruct-fine-tuned-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-11B-v2.2-Instruct-fine-tuned
+      random_seed=997 output_file=data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_997.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 25bee3b4ee09b36d636095b4c927a0d3
-      size: 1759194
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_7312.json
-      hash: md5
-      md5: 9d22089c8d23bbc5a028c748e5522c23
-      size: 1157
-  evaluate_llm_as_judge@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 82b2c535d99d91b9a34986375bfa31a9
-      size: 1758747
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_997.json
-      hash: md5
-      md5: 4222d5b165de8a3a89d71d6519b71b76
-      size: 1170
-  evaluate_llm_as_judge@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      hash: md5
-      md5: 1385f49966e9db2a88a17f53d0887ad8
-      size: 1741944
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json
-      hash: md5
-      md5: f4bac633a65afde9bf5612f35c3089bb
-      size: 1170
-  evaluate_llm_as_judge@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      hash: md5
-      md5: 924744efce1483e9128579cad7a4454c
-      size: 1748772
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json
-      hash: md5
-      md5: 1f95777ef87a547fa7a41dc597adfc39
-      size: 1166
-  evaluate_llm_as_judge@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      hash: md5
-      md5: 4d023797a9053fd7df61f6b1796112e9
-      size: 1747404
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json
-      hash: md5
-      md5: de3f557dfdf3440262e4d8f811e526ca
-      size: 1167
-  evaluate_llm_as_judge@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_42.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 14d4613f7d9495f5fb5f2d7b81f402a9
-      size: 1825646
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_42.json
-      hash: md5
-      md5: e8cff190991ee3164825dbf7eca03d12
-      size: 1170
-  evaluate_llm_as_judge@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 302e1dc4f064007e3df88ac1e8acccc5
-      size: 1831330
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_7312.json
-      hash: md5
-      md5: aee4a08e0a4d0398b34a2587c039244d
-      size: 1169
-  evaluate_llm_as_judge@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 41a47dc56efc29b6c2771db68bdacb17
-      size: 1822491
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_997.json
-      hash: md5
-      md5: aac703269b10c85d1a2b5303c22ca077
-      size: 1168
-  evaluate_llm_as_judge@gpt_4o_mini-Bielik-7B-Instruct-v0.1-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_42.json
+    - hash: md5
+      md5: b72e852654399c31589e5368e554cbfb
+      path: configs/model/Bielik-11B-v2.2-Instruct-fine-tuned.yaml
+      size: 256
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
+    outs:
+    - hash: md5
+      md5: ba53d76f701eddb60a182de49d992878
+      path: data/experiments/predict/pl-court-instruct/Bielik-11B-v2.2-Instruct-fine-tuned/outputs_997.json
+      size: 1857855
+  predict_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-7B-Instruct-v0.1
+      random_seed=42 output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
-      hash: md5
+    - hash: md5
+      md5: c3412525e9819b53fbad06363a07a871
+      path: configs/model/Bielik-7B-Instruct-v0.1.yaml
+      size: 173
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
+    outs:
+    - hash: md5
       md5: 2dc39513a04910c5d0c54380166639d9
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
       size: 2029644
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_42.json
-      hash: md5
-      md5: 243da4df07c6dfb5199b925e3f5c07aa
-      size: 1137
-  evaluate_llm_as_judge@gpt_4o_mini-Bielik-7B-Instruct-v0.1-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_7312.json
+  predict_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-7B-Instruct-v0.1
+      random_seed=7312 output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
-      hash: md5
+    - hash: md5
+      md5: c3412525e9819b53fbad06363a07a871
+      path: configs/model/Bielik-7B-Instruct-v0.1.yaml
+      size: 173
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
+    outs:
+    - hash: md5
       md5: ae39bf31296ffe82c0f6a3e8c9ff63aa
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
       size: 2014399
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_7312.json
-      hash: md5
-      md5: 8098cc937d57455ca47d32c3449159a3
-      size: 1129
-  evaluate_llm_as_judge@gpt_4o_mini-Bielik-7B-Instruct-v0.1-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_997.json
+  predict_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-7B-Instruct-v0.1
+      random_seed=997 output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
-      hash: md5
+    - hash: md5
+      md5: c3412525e9819b53fbad06363a07a871
+      path: configs/model/Bielik-7B-Instruct-v0.1.yaml
+      size: 173
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
+    outs:
+    - hash: md5
       md5: fac04d78ad020b50f79fc7277a037e8e
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
       size: 2016400
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_997.json
-      hash: md5
-      md5: f1390b2d50893a17c90fc277dc363d6a
-      size: 1139
-  evaluate_llm_as_judge@gpt_4o_mini-Bielik-7B-Instruct-v0.1-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_42.json
+  predict_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-7B-Instruct-v0.1-fine-tuned
+      random_seed=42 output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
-      hash: md5
+    - hash: md5
+      md5: 2d9590869dfe247d7c6335f3cd7dd0c2
+      path: configs/model/Bielik-7B-Instruct-v0.1-fine-tuned.yaml
+      size: 253
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
+    outs:
+    - hash: md5
       md5: 178eb0649617d4a698da6c9e315e84c5
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
       size: 2034749
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_42.json
-      hash: md5
-      md5: 302b957707520fa327d1da0edf18baa3
-      size: 1167
-  evaluate_llm_as_judge@gpt_4o_mini-Bielik-7B-Instruct-v0.1-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_7312.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 743ea22448bc73a7a991da075fca8841
-      size: 2031343
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_7312.json
-      hash: md5
-      md5: 789f0906846251d3f0cab78d111f9c56
-      size: 1163
-  evaluate_llm_as_judge@gpt_4o_mini-Bielik-7B-Instruct-v0.1-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_997.json
-    deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 433a4b2aa7870a134277a265d099a588
-      size: 2029482
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_997.json
-      hash: md5
-      md5: 90f3ed04ef29c5cd29b7ec8f02a780a1
-      size: 1163
-  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json
-      prompt=en
-    deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      hash: md5
-      md5: 94924275d576271875fecf22c0f9b39e
-      size: 817490
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json
-      hash: md5
-      md5: 4395c32931d25a1bd9aa092c5a0e5460
-      size: 478
-  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json
-      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_997.json
-      prompt=en
-    deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_997.json
-      hash: md5
-      md5: 4e968cac351ad48ad786d1ecccbbc967
-      size: 670674
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_997.json
-      hash: md5
-      md5: 90c2b0cd132130d0b9d3a60bf6fdd69b
-      size: 486
-  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json
-      prompt=en
-    deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      hash: md5
-      md5: 94c30cf8fe7db71afc58a5c9cdbc0d9f
-      size: 705894
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json
-      hash: md5
-      md5: 860b5c00ace1f2967db9b5a977cfc3ad
-      size: 478
-  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json
-      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_997.json
-      prompt=en
+  predict_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-7B-Instruct-v0.1-fine-tuned
+      random_seed=7312 output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_997.json
-      hash: md5
-      md5: 787c129090aa1b64e337b236a4391402
-      size: 642477
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_997.json
-      hash: md5
-      md5: 34de8eabaebe6a96b4b664b664f222e2
-      size: 484
-  summarize_metrics@data/experiments/predict/en-court-instruct:
-    cmd: PYTHONPATH=. python scripts/sft/summarize_metrics.py --root-dir data/experiments/predict/en-court-instruct
+    - hash: md5
+      md5: 2d9590869dfe247d7c6335f3cd7dd0c2
+      path: configs/model/Bielik-7B-Instruct-v0.1-fine-tuned.yaml
+      size: 253
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
+    outs:
+    - hash: md5
+      md5: 743ea22448bc73a7a991da075fca8841
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
+      size: 2031343
+  predict_pl@pl-court-instruct-Bielik-7B-Instruct-v0.1-fine-tuned-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Bielik-7B-Instruct-v0.1-fine-tuned
+      random_seed=997 output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
     deps:
-    - path: scripts/sft/summarize_metrics.py
-      hash: md5
-      md5: eb5736f5709f9773acf21bfc28c2e012
-      size: 2975
+    - hash: md5
+      md5: 2d9590869dfe247d7c6335f3cd7dd0c2
+      path: configs/model/Bielik-7B-Instruct-v0.1-fine-tuned.yaml
+      size: 253
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: data/experiments/predict/en-court-instruct/metrics_judge_summary.md
-      hash: md5
-      md5: 6065f2fbff28ab7439d35ddfe03b1938
-      size: 4857
-    - path: data/experiments/predict/en-court-instruct/metrics_ngram_summary.md
-      hash: md5
-      md5: 1bb66cbd940bd2288f69fbe490465aaa
-      size: 1031
-  evaluate_api_models@en-court-instruct-open_ai_gpt-4o-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
-      --num-proc=-1
+    - hash: md5
+      md5: 433a4b2aa7870a134277a265d099a588
+      path: data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
+      size: 2029482
+  predict_pl@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Llama-3-8B-Instruct
+      random_seed=42 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
     deps:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
-      hash: md5
-      md5: 8f70e2baa0b0ae8a320577f5c8a60011
-      size: 679432
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 56a95874b3e77e7ffec11c00330da5b6
+      path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
+      size: 176
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o/metrics_997.json
-      hash: md5
-      md5: ac30bcf3c40000cab61e0914b56aba85
-      size: 157
-  evaluate_api_models@en-court-instruct-open_ai_gpt-4o-mini-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      --num-proc=-1
+    - hash: md5
+      md5: e99c88720116c951087b6125e5f4be4d
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
+      size: 2008073
+  predict_pl@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Llama-3-8B-Instruct
+      random_seed=7312 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
     deps:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      hash: md5
-      md5: 2a0819011b3eac56e497201a9f67e310
-      size: 690306
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 56a95874b3e77e7ffec11c00330da5b6
+      path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
+      size: 176
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/metrics_997.json
-      hash: md5
-      md5: d70eb0821aff9c9e874a421b80f7f697
-      size: 155
-  evaluate_api_models@pl-court-instruct-open_ai_gpt-4o-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
-      --num-proc=-1
+    - hash: md5
+      md5: 4c25368aacb7402b1b2cae9368d187d1
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
+      size: 2013637
+  predict_pl@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Llama-3-8B-Instruct
+      random_seed=997 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
     deps:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
-      hash: md5
-      md5: 7c5833fdd1419163b286baaa3d71e084
-      size: 1965252
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 56a95874b3e77e7ffec11c00330da5b6
+      path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
+      size: 176
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/metrics_997.json
-      hash: md5
-      md5: 65c808d4aebd8efe37b94a5128a19de6
-      size: 306
-  evaluate_api_models@pl-court-instruct-open_ai_gpt-4o-mini-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      --num-proc=-1
+    - hash: md5
+      md5: baef589507248af212aaae51602fd999
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
+      size: 2010150
+  predict_pl@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Llama-3-8B-Instruct-fine-tuned
+      random_seed=42 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
     deps:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      hash: md5
-      md5: 839c911f542cd7c60c9ae52ef95e9907
-      size: 1812429
-    - path: scripts/sft/evaluate.py
-      hash: md5
-      md5: 0644efb76af2c5461185e37a07ba2c17
-      size: 697
+    - hash: md5
+      md5: 3906c39a5c516f89ddafb7eff21615cd
+      path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned.yaml
+      size: 275
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/metrics_997.json
-      hash: md5
-      md5: fe43f0d25b500a0f2fb2d8199b8034fd
-      size: 305
-  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json
-      prompt=en
+    - hash: md5
+      md5: 289b719e8c7166e578417e5706bdc4e3
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
+      size: 1760355
+  predict_pl@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Llama-3-8B-Instruct-fine-tuned
+      random_seed=7312 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      hash: md5
-      md5: 761018c0a306fbee63dad2fbc119110d
-      size: 821683
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 3906c39a5c516f89ddafb7eff21615cd
+      path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned.yaml
+      size: 275
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json
-      hash: md5
-      md5: 77ecbff8c82afbfd6fec098fb87e1218
-      size: 478
-  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json
-      prompt=en
+    - hash: md5
+      md5: 25bee3b4ee09b36d636095b4c927a0d3
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
+      size: 1759194
+  predict_pl@pl-court-instruct-Unsloth-Llama-3-8B-Instruct-fine-tuned-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Llama-3-8B-Instruct-fine-tuned
+      random_seed=997 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      hash: md5
-      md5: a7361535b440251d6ce6232a15cfcdf2
-      size: 818877
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 3906c39a5c516f89ddafb7eff21615cd
+      path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned.yaml
+      size: 275
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json
-      hash: md5
-      md5: f25c9ad98ef817e976def98d6b7d3b5d
-      size: 482
-  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json
-      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_42.json
-      prompt=en
+    - hash: md5
+      md5: 82b2c535d99d91b9a34986375bfa31a9
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
+      size: 1758747
+  predict_pl@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-7B-Instruct-v0.3
+      random_seed=42 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_42.json
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_42.json
-      hash: md5
-      md5: 4246a4fafba5e130aac3db6c1c61ce30
-      size: 675578
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: d184e20107315876e7751bdc7c3841ad
+      path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml
+      size: 182
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_42.json
-      hash: md5
-      md5: 5f2cea81c873a3b85ef95ba9a6dc90a5
-      size: 487
-  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-en-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json
-      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_7312.json
-      prompt=en
+    - hash: md5
+      md5: 4a9d3a2bb1dd47a732bd2df8102bc93f
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_42.json
+      size: 1799957
+  predict_pl@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-7B-Instruct-v0.3
+      random_seed=7312 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_7312.json
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/outputs_7312.json
-      hash: md5
-      md5: f0b806eebca2f3ddf49d0ff821856b45
-      size: 670935
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: d184e20107315876e7751bdc7c3841ad
+      path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml
+      size: 182
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned-en/judge_metrics_7312.json
-      hash: md5
-      md5: 5cc45cac8a7607e42a8a394593d33396
-      size: 486
-  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json
-      prompt=en
+    - hash: md5
+      md5: 77e10dd2ec17e12e171e4bcab1a48e08
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_7312.json
+      size: 1795629
+  predict_pl@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-7B-Instruct-v0.3
+      random_seed=997 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_997.json
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      hash: md5
-      md5: 4fe25ad80a20ea5d6200136176b3e4ca
-      size: 705218
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: d184e20107315876e7751bdc7c3841ad
+      path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml
+      size: 182
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json
-      hash: md5
-      md5: 69901f631da4ffefd09e7cbfac39cd89
-      size: 480
-  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json
-      prompt=en
+    - hash: md5
+      md5: bbb883aa388b274bef3e9296df26f68f
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3/outputs_997.json
+      size: 1795752
+  predict_pl@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned
+      random_seed=42 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_42.json
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      hash: md5
-      md5: cf4fdbf0e26e6c793bdca4edd6e365c0
-      size: 703876
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 828382dc03dbed80cff4a3358321dc4a
+      path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml
+      size: 271
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json
-      hash: md5
-      md5: 860b5c00ace1f2967db9b5a977cfc3ad
-      size: 478
-  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json
-      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_42.json
-      prompt=en
+    - hash: md5
+      md5: 156091297490d893f9815d2ffcf17cbf
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_42.json
+      size: 1792160
+  predict_pl@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned
+      random_seed=7312 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_7312.json
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_42.json
-      hash: md5
-      md5: 313fa5a662f37cacae4980a04830f422
-      size: 642688
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 828382dc03dbed80cff4a3358321dc4a
+      path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml
+      size: 271
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_42.json
-      hash: md5
-      md5: 974e972a09d844a77840029d642e8077
-      size: 486
-  evaluate_llm_as_judge_en@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json
-      out_metric_file=data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_7312.json
-      prompt=en
+    - hash: md5
+      md5: 3d336675e54a706fae45349adbaf6ee4
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_7312.json
+      size: 1793461
+  predict_pl@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned
+      random_seed=997 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_997.json
     deps:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/outputs_7312.json
-      hash: md5
-      md5: 4ed8db93aa14f1cc98e276d3989efa9e
-      size: 642730
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 828382dc03dbed80cff4a3358321dc4a
+      path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml
+      size: 271
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-en/judge_metrics_7312.json
-      hash: md5
-      md5: 8a9712eb10a8da99d86bab8968fd3207
-      size: 485
-  evaluate_llm_as_judge_api_models@pl-court-instruct-gpt_4o_mini-open_ai_gpt-4o-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/judge_metrics_997.json
+    - hash: md5
+      md5: 55d682fba1c08c68552e98be6b503b4e
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned/outputs_997.json
+      size: 1790731
+  predict_pl@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407
+      random_seed=42 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
     deps:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
-      hash: md5
-      md5: 7c5833fdd1419163b286baaa3d71e084
-      size: 1965252
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
+    - hash: md5
+      md5: ca5ac52e503c9f488f98f569811c76dc
+      path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
+      size: 261
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/judge_metrics_997.json
-      hash: md5
-      md5: 867f10aeb55a3bd46b08c8a75c3bfc60
-      size: 1176
-  evaluate_llm_as_judge_api_models@pl-court-instruct-gpt_4o_mini-open_ai_gpt-4o-mini-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json
+    - hash: md5
+      md5: 1385f49966e9db2a88a17f53d0887ad8
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
+      size: 1741944
+  predict_pl@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407
+      random_seed=7312 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
     deps:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      hash: md5
-      md5: 839c911f542cd7c60c9ae52ef95e9907
-      size: 1812429
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json
-      hash: md5
-      md5: 24037233e5abe74fe13f69dd4fc5e26a
-      size: 1173
-  evaluate_llm_as_judge_api_models@en-court-instruct-gpt_4o_mini-open_ai_gpt-4o-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
-      out_metric_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o/judge_metrics_997.json
+    - hash: md5
+      md5: ca5ac52e503c9f488f98f569811c76dc
+      path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
+      size: 261
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
+    outs:
+    - hash: md5
+      md5: 924744efce1483e9128579cad7a4454c
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
+      size: 1748772
+  predict_pl@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407
+      random_seed=997 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
     deps:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
-      hash: md5
-      md5: 8f70e2baa0b0ae8a320577f5c8a60011
-      size: 679432
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o/judge_metrics_997.json
-      hash: md5
-      md5: 41921cec37a7e162f73e7a0d1e106eb1
-      size: 482
-  evaluate_llm_as_judge_api_models@en-court-instruct-gpt_4o_mini-open_ai_gpt-4o-mini-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      out_metric_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json
+    - hash: md5
+      md5: ca5ac52e503c9f488f98f569811c76dc
+      path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
+      size: 261
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
+    outs:
+    - hash: md5
+      md5: 4d023797a9053fd7df61f6b1796112e9
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
+      size: 1747404
+  predict_pl@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned
+      random_seed=42 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
     deps:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      hash: md5
-      md5: 2a0819011b3eac56e497201a9f67e310
-      size: 690306
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 1e556a79b0f9cf0a9cfdb23ed8077bcc
-      size: 2172
-    outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json
-      hash: md5
-      md5: 4edc8fe239f53890d71291f61b6cc96c
-      size: 486
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json
-      prompt=pl
+    - hash: md5
+      md5: 1d9e6407d121214f949d56ca5c3425f5
+      path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned.yaml
+      size: 367
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
+    outs:
+    - hash: md5
+      md5: 14d4613f7d9495f5fb5f2d7b81f402a9
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
+      size: 1825646
+  predict_pl@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned
+      random_seed=7312 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_42.json
-      hash: md5
-      md5: e99c88720116c951087b6125e5f4be4d
-      size: 2008073
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 1d9e6407d121214f949d56ca5c3425f5
+      path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned.yaml
+      size: 367
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_42.json
-      hash: md5
-      md5: 9d9fba0cf2169e9dd9f69579a2182b8e
-      size: 1172
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json
-      prompt=pl
+    - hash: md5
+      md5: 302e1dc4f064007e3df88ac1e8acccc5
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
+      size: 1831330
+  predict_pl@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned
+      random_seed=997 output_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_7312.json
-      hash: md5
-      md5: 4c25368aacb7402b1b2cae9368d187d1
-      size: 2013637
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 1d9e6407d121214f949d56ca5c3425f5
+      path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned.yaml
+      size: 367
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_7312.json
-      hash: md5
-      md5: e58171fc082d33c84497a13dabcf766c
-      size: 1167
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json
-      prompt=pl
+    - hash: md5
+      md5: 41a47dc56efc29b6c2771db68bdacb17
+      path: data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
+      size: 1822491
+  predict_pl@pl-court-instruct-qra-13b-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=qra-13b
+      random_seed=42 output_file=data/experiments/predict/pl-court-instruct/qra-13b/outputs_42.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/outputs_997.json
-      hash: md5
-      md5: baef589507248af212aaae51602fd999
-      size: 2010150
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: ab2baba7b6109364d7e04c77232b0f9d
+      path: configs/model/qra-13b.yaml
+      size: 152
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct/judge_metrics_997.json
-      hash: md5
-      md5: f8d16a5298fabe288486822779470cd8
-      size: 1165
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_42.json
-      prompt=pl
+    - hash: md5
+      md5: dd142d2d1c24c499bbe615bf4b74525c
+      path: data/experiments/predict/pl-court-instruct/qra-13b/outputs_42.json
+      size: 2247396
+  predict_pl@pl-court-instruct-qra-13b-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=qra-13b
+      random_seed=7312 output_file=data/experiments/predict/pl-court-instruct/qra-13b/outputs_7312.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 289b719e8c7166e578417e5706bdc4e3
-      size: 1760355
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: ab2baba7b6109364d7e04c77232b0f9d
+      path: configs/model/qra-13b.yaml
+      size: 152
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_42.json
-      hash: md5
-      md5: 70398042d030309e7e0bc7ba927136f3
-      size: 1167
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_7312.json
-      prompt=pl
+    - hash: md5
+      md5: fddb307b29b598df3786fc94d479e918
+      path: data/experiments/predict/pl-court-instruct/qra-13b/outputs_7312.json
+      size: 2254243
+  predict_pl@pl-court-instruct-qra-13b-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=qra-13b
+      random_seed=997 output_file=data/experiments/predict/pl-court-instruct/qra-13b/outputs_997.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 25bee3b4ee09b36d636095b4c927a0d3
-      size: 1759194
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: ab2baba7b6109364d7e04c77232b0f9d
+      path: configs/model/qra-13b.yaml
+      size: 152
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_7312.json
-      hash: md5
-      md5: 9d22089c8d23bbc5a028c748e5522c23
-      size: 1157
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Llama-3-8B-Instruct-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_997.json
-      prompt=pl
+    - hash: md5
+      md5: 72ef8a411b8f5aeb006c99e5868c754d
+      path: data/experiments/predict/pl-court-instruct/qra-13b/outputs_997.json
+      size: 2252480
+  predict_pl@pl-court-instruct-trurl-13B-academic-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=trurl-13B-academic
+      random_seed=42 output_file=data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_42.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 82b2c535d99d91b9a34986375bfa31a9
-      size: 1758747
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 3aa3ce4fc9a1958bef82b8dbfd44ab6b
+      path: configs/model/trurl-13B-academic.yaml
+      size: 168
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Llama-3-8B-Instruct-fine-tuned/judge_metrics_997.json
-      hash: md5
-      md5: 4222d5b165de8a3a89d71d6519b71b76
-      size: 1170
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json
-      prompt=pl
+    - hash: md5
+      md5: bb571102170940efc73f02143a530d5b
+      path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_42.json
+      size: 1289839
+  predict_pl@pl-court-instruct-trurl-13B-academic-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=trurl-13B-academic
+      random_seed=7312 output_file=data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_7312.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_42.json
-      hash: md5
-      md5: 1385f49966e9db2a88a17f53d0887ad8
-      size: 1741944
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 3aa3ce4fc9a1958bef82b8dbfd44ab6b
+      path: configs/model/trurl-13B-academic.yaml
+      size: 168
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_42.json
-      hash: md5
-      md5: f4bac633a65afde9bf5612f35c3089bb
-      size: 1170
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json
-      prompt=pl
+    - hash: md5
+      md5: bcd41ca4629d4cec2440a8ed2f02560f
+      path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_7312.json
+      size: 1283974
+  predict_pl@pl-court-instruct-trurl-13B-academic-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py dataset=pl-court-instruct model=trurl-13B-academic
+      random_seed=997 output_file=data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_997.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_7312.json
-      hash: md5
-      md5: 924744efce1483e9128579cad7a4454c
-      size: 1748772
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 3aa3ce4fc9a1958bef82b8dbfd44ab6b
+      path: configs/model/trurl-13B-academic.yaml
+      size: 168
+    - hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      path: configs/predict.yaml
+      size: 402
+    - hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      path: scripts/sft/predict.py
+      size: 3198
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_7312.json
-      hash: md5
-      md5: 1f95777ef87a547fa7a41dc597adfc39
-      size: 1166
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json
-      prompt=pl
+    - hash: md5
+      md5: 731cff0eb1484682de211336efeff153
+      path: data/experiments/predict/pl-court-instruct/trurl-13B-academic/outputs_997.json
+      size: 1288941
+  predict_with_api@en-court-instruct-gpt-4o-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict_with_api.py dataset=en-court-instruct
+      model_version=gpt-4o seed=997 output_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/outputs_997.json
-      hash: md5
-      md5: 4d023797a9053fd7df61f6b1796112e9
-      size: 1747404
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: aff18078742a14c3d8ce2cd74e718d44
+      path: configs/predict_with_api.yaml
+      size: 320
+    - hash: md5
+      md5: 142508c7b6df391083b0e81a3a6c4795
+      path: scripts/sft/predict_with_api.py
+      size: 3968
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407/judge_metrics_997.json
-      hash: md5
-      md5: de3f557dfdf3440262e4d8f811e526ca
-      size: 1167
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_42.json
-      prompt=pl
+    - hash: md5
+      md5: 8f70e2baa0b0ae8a320577f5c8a60011
+      path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
+      size: 679432
+  predict_with_api@en-court-instruct-gpt-4o-mini-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict_with_api.py dataset=en-court-instruct
+      model_version=gpt-4o-mini seed=997 output_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 14d4613f7d9495f5fb5f2d7b81f402a9
-      size: 1825646
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: aff18078742a14c3d8ce2cd74e718d44
+      path: configs/predict_with_api.yaml
+      size: 320
+    - hash: md5
+      md5: 142508c7b6df391083b0e81a3a6c4795
+      path: scripts/sft/predict_with_api.py
+      size: 3968
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_42.json
-      hash: md5
-      md5: e8cff190991ee3164825dbf7eca03d12
-      size: 1170
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_7312.json
-      prompt=pl
+    - hash: md5
+      md5: 2a0819011b3eac56e497201a9f67e310
+      path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
+      size: 690306
+  predict_with_api@pl-court-instruct-gpt-4o-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict_with_api.py dataset=pl-court-instruct
+      model_version=gpt-4o seed=997 output_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 302e1dc4f064007e3df88ac1e8acccc5
-      size: 1831330
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: aff18078742a14c3d8ce2cd74e718d44
+      path: configs/predict_with_api.yaml
+      size: 320
+    - hash: md5
+      md5: 142508c7b6df391083b0e81a3a6c4795
+      path: scripts/sft/predict_with_api.py
+      size: 3968
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_7312.json
-      hash: md5
-      md5: aee4a08e0a4d0398b34a2587c039244d
-      size: 1169
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_997.json
-      prompt=pl
+    - hash: md5
+      md5: 7c5833fdd1419163b286baaa3d71e084
+      path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
+      size: 1965252
+  predict_with_api@pl-court-instruct-gpt-4o-mini-997:
+    cmd: PYTHONPATH=. python scripts/sft/predict_with_api.py dataset=pl-court-instruct
+      model_version=gpt-4o-mini seed=997 output_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 41a47dc56efc29b6c2771db68bdacb17
-      size: 1822491
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: aff18078742a14c3d8ce2cd74e718d44
+      path: configs/predict_with_api.yaml
+      size: 320
+    - hash: md5
+      md5: 142508c7b6df391083b0e81a3a6c4795
+      path: scripts/sft/predict_with_api.py
+      size: 3968
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned/judge_metrics_997.json
-      hash: md5
-      md5: aac703269b10c85d1a2b5303c22ca077
-      size: 1168
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_42.json
-      prompt=pl
+    - hash: md5
+      md5: 839c911f542cd7c60c9ae52ef95e9907
+      path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
+      size: 1812429
+  raw_dataset_readme:
+    cmd: jupyter nbconvert --no-input --to markdown --execute 'nbs/Dataset Cards/01_Dataset_Description_Raw.ipynb'
+      --output-dir data/datasets/pl/readme/raw --output README
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
-      hash: md5
-      md5: 2dc39513a04910c5d0c54380166639d9
-      size: 2029644
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 622ba21868561c26fb6877ad95bfb5c5.dir
+      nfiles: 17
+      path: data/datasets/pl/raw
+      size: 10234505621
+    - hash: md5
+      md5: 11b39233ef419de713493cb5ec8bcfd9
+      path: nbs/Dataset Cards/01_Dataset_Description_Raw.ipynb
+      size: 77118
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_42.json
-      hash: md5
-      md5: 243da4df07c6dfb5199b925e3f5c07aa
-      size: 1137
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_7312.json
-      prompt=pl
+    - hash: md5
+      md5: c82b8238e3043491c6fa49e9641e8dac.dir
+      nfiles: 8
+      path: data/datasets/pl/readme/raw/
+      size: 475420
+  sft_unsloth@en-court-instruct-Unsloth-Llama-3-8B-Instruct:
+    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py dataset=en-court-instruct
+      model=Unsloth-Llama-3-8B-Instruct
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
-      hash: md5
-      md5: ae39bf31296ffe82c0f6a3e8c9ff63aa
-      size: 2014399
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
+      path: configs/fine_tuning.yaml
+      size: 1356
+    - hash: md5
+      md5: 56a95874b3e77e7ffec11c00330da5b6
+      path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
+      size: 176
+    - hash: md5
+      md5: 4b77ee1ea604cae18f17ca00cdb6988b
+      path: scripts/sft/fine_tune_llm.py
+      size: 4578
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_7312.json
-      hash: md5
-      md5: 8098cc937d57455ca47d32c3449159a3
-      size: 1129
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_997.json
-      prompt=pl
+    - hash: md5
+      md5: c99c2a68274325db86fbbd41bcc30e78.dir
+      nfiles: 18
+      path: data/experiments/fine-tune/Unsloth-Llama-3-8B-Instruct/en-court-instruct/
+      size: 354395477
+  sft_unsloth@en-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407:
+    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py dataset=en-court-instruct
+      model=Unsloth-Mistral-Nemo-Instruct-2407
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_997.json
-      hash: md5
-      md5: fac04d78ad020b50f79fc7277a037e8e
-      size: 2016400
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
+      path: configs/fine_tuning.yaml
+      size: 1356
+    - hash: md5
+      md5: ca5ac52e503c9f488f98f569811c76dc
+      path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
+      size: 261
+    - hash: md5
+      md5: 4b77ee1ea604cae18f17ca00cdb6988b
+      path: scripts/sft/fine_tune_llm.py
+      size: 4578
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/judge_metrics_997.json
-      hash: md5
-      md5: f1390b2d50893a17c90fc277dc363d6a
-      size: 1139
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-fine-tuned-42:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_42.json
-      prompt=pl
+    - hash: md5
+      md5: 4c4f973ee0648610fc4b696059fae47a.dir
+      nfiles: 18
+      path: data/experiments/fine-tune/Unsloth-Mistral-Nemo-Instruct-2407/en-court-instruct/
+      size: 475726484
+  sft_unsloth@pl-court-instruct-Bielik-11B-v2.2-Instruct:
+    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py dataset=pl-court-instruct
+      model=Bielik-11B-v2.2-Instruct
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_42.json
-      hash: md5
-      md5: 178eb0649617d4a698da6c9e315e84c5
-      size: 2034749
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
+      path: configs/fine_tuning.yaml
+      size: 1356
+    - hash: md5
+      md5: 1cfb3fbe30fac3e07a30339e6bf197c9
+      path: configs/model/Bielik-11B-v2.2-Instruct.yaml
+      size: 175
+    - hash: md5
+      md5: 4b77ee1ea604cae18f17ca00cdb6988b
+      path: scripts/sft/fine_tune_llm.py
+      size: 4578
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_42.json
-      hash: md5
-      md5: 302b957707520fa327d1da0edf18baa3
-      size: 1167
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-fine-tuned-7312:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_7312.json
-      prompt=pl
+    - hash: md5
+      md5: 6acbafad481c0a8d3e6a989aa50dce46.dir
+      nfiles: 39
+      path: data/experiments/fine-tune/Bielik-11B-v2.2-Instruct/pl-court-instruct/
+      size: 1189374238
+  sft_unsloth@pl-court-instruct-Bielik-7B-Instruct-v0.1:
+    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py dataset=pl-court-instruct
+      model=Bielik-7B-Instruct-v0.1
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_7312.json
-      hash: md5
-      md5: 743ea22448bc73a7a991da075fca8841
-      size: 2031343
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
+      path: configs/fine_tuning.yaml
+      size: 1356
+    - hash: md5
+      md5: c3412525e9819b53fbad06363a07a871
+      path: configs/model/Bielik-7B-Instruct-v0.1.yaml
+      size: 173
+    - hash: md5
+      md5: 4b77ee1ea604cae18f17ca00cdb6988b
+      path: scripts/sft/fine_tune_llm.py
+      size: 4578
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_7312.json
-      hash: md5
-      md5: 789f0906846251d3f0cab78d111f9c56
-      size: 1163
-  evaluate_llm_as_judge_pl@gpt_4o_mini-Bielik-7B-Instruct-v0.1-fine-tuned-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_997.json
-      prompt=pl
+    - hash: md5
+      md5: be61ab5ea1365c1bcf908952bc015ab4.dir
+      nfiles: 108
+      path: data/experiments/fine-tune/Bielik-7B-Instruct-v0.1/pl-court-instruct/
+      size: 2293711014
+  sft_unsloth@pl-court-instruct-Unsloth-Llama-3-8B-Instruct:
+    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py dataset=pl-court-instruct
+      model=Unsloth-Llama-3-8B-Instruct
     deps:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/outputs_997.json
-      hash: md5
-      md5: 433a4b2aa7870a134277a265d099a588
-      size: 2029482
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
+      path: configs/fine_tuning.yaml
+      size: 1356
+    - hash: md5
+      md5: 56a95874b3e77e7ffec11c00330da5b6
+      path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
+      size: 176
+    - hash: md5
+      md5: 4b77ee1ea604cae18f17ca00cdb6988b
+      path: scripts/sft/fine_tune_llm.py
+      size: 4578
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1-fine-tuned/judge_metrics_997.json
-      hash: md5
-      md5: 90f3ed04ef29c5cd29b7ec8f02a780a1
-      size: 1163
-  evaluate_llm_as_judge_api_models@pl-gpt_4o_mini-open_ai_gpt-4o-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/judge_metrics_997.json
-      prompt=pl
+    - hash: md5
+      md5: d9850d30d221f257e1453a66a6c1eef3.dir
+      nfiles: 33
+      path: data/experiments/fine-tune/Unsloth-Llama-3-8B-Instruct/pl-court-instruct/
+      size: 784320233
+  sft_unsloth@pl-court-instruct-Unsloth-Mistral-7B-Instruct-v0.3:
+    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py dataset=pl-court-instruct
+      model=Unsloth-Mistral-7B-Instruct-v0.3
     deps:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
-      hash: md5
-      md5: 7c5833fdd1419163b286baaa3d71e084
-      size: 1965252
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
+      path: configs/fine_tuning.yaml
+      size: 1356
+    - hash: md5
+      md5: d184e20107315876e7751bdc7c3841ad
+      path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml
+      size: 182
+    - hash: md5
+      md5: 4b77ee1ea604cae18f17ca00cdb6988b
+      path: scripts/sft/fine_tune_llm.py
+      size: 4578
     outs:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/judge_metrics_997.json
-      hash: md5
-      md5: 867f10aeb55a3bd46b08c8a75c3bfc60
-      size: 1176
-  evaluate_llm_as_judge_api_models@pl-gpt_4o_mini-open_ai_gpt-4o-mini-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      out_metric_file=data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json
-      prompt=pl
+    - hash: md5
+      md5: 1b47e8203c533942e1903dd816f7a7f7.dir
+      nfiles: 66
+      path: data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/pl-court-instruct/
+      size: 1518954466
+  sft_unsloth@pl-court-instruct-Unsloth-Mistral-Nemo-Instruct-2407:
+    cmd: PYTHONPATH=. python scripts/sft/fine_tune_llm.py dataset=pl-court-instruct
+      model=Unsloth-Mistral-Nemo-Instruct-2407
     deps:
-    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      hash: md5
-      md5: 839c911f542cd7c60c9ae52ef95e9907
-      size: 1812429
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: 3933c4faf5a478d0f9d3963c3b29e5cc
+      path: configs/fine_tuning.yaml
+      size: 1356
+    - hash: md5
+      md5: ca5ac52e503c9f488f98f569811c76dc
+      path: configs/model/Unsloth-Mistral-Nemo-Instruct-2407.yaml
+      size: 261
+    - hash: md5
+      md5: 4b77ee1ea604cae18f17ca00cdb6988b
+      path: scripts/sft/fine_tune_llm.py
+      size: 4578
     outs:
-    - path: 
-        data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json
-      hash: md5
-      md5: 24037233e5abe74fe13f69dd4fc5e26a
-      size: 1173
-  evaluate_llm_as_judge_api_models@en-gpt_4o_mini-open_ai_gpt-4o-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
-      out_metric_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o/judge_metrics_997.json
-      prompt=en
+    - hash: md5
+      md5: 80bceb56982e9bdb8d4b441bf843014f.dir
+      nfiles: 33
+      path: data/experiments/fine-tune/Unsloth-Mistral-Nemo-Instruct-2407/pl-court-instruct/
+      size: 1056899473
+  summarize_metrics@data/experiments/predict/en-court-instruct:
+    cmd: PYTHONPATH=. python scripts/sft/summarize_metrics.py --root-dir data/experiments/predict/en-court-instruct
     deps:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o/outputs_997.json
-      hash: md5
-      md5: 8f70e2baa0b0ae8a320577f5c8a60011
-      size: 679432
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: eb5736f5709f9773acf21bfc28c2e012
+      path: scripts/sft/summarize_metrics.py
+      size: 2975
     outs:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o/judge_metrics_997.json
-      hash: md5
-      md5: 1ad8736bed0fff4e88a9c32775f370bf
-      size: 481
-  evaluate_llm_as_judge_api_models@en-gpt_4o_mini-open_ai_gpt-4o-mini-997:
-    cmd: PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py api_model=gpt_4o_mini
-      answers_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      out_metric_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json
-      prompt=en
+    - hash: md5
+      md5: 6065f2fbff28ab7439d35ddfe03b1938
+      path: data/experiments/predict/en-court-instruct/metrics_judge_summary.md
+      size: 4857
+    - hash: md5
+      md5: 1bb66cbd940bd2288f69fbe490465aaa
+      path: data/experiments/predict/en-court-instruct/metrics_ngram_summary.md
+      size: 1031
+  summarize_metrics@data/experiments/predict/pl-court-instruct:
+    cmd: PYTHONPATH=. python scripts/sft/summarize_metrics.py --root-dir data/experiments/predict/pl-court-instruct
     deps:
-    - path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
-      hash: md5
-      md5: 2a0819011b3eac56e497201a9f67e310
-      size: 690306
-    - path: scripts/sft/evaluate_llm_as_judge.py
-      hash: md5
-      md5: 79a02fb864cb279f93fc4171043bb31c
-      size: 2253
+    - hash: md5
+      md5: eb5736f5709f9773acf21bfc28c2e012
+      path: scripts/sft/summarize_metrics.py
+      size: 2975
     outs:
-    - path: 
-        data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/judge_metrics_997.json
-      hash: md5
-      md5: bd272bea099716c0c2e689a2d19c0071
-      size: 488
+    - hash: md5
+      md5: 3a94f7b7932b8404b88df0236a82ca4d
+      path: data/experiments/predict/pl-court-instruct/metrics_judge_summary.md
+      size: 14993
+    - hash: md5
+      md5: 1a3cc443d01b7a5a86657e686f3ea763
+      path: data/experiments/predict/pl-court-instruct/metrics_ngram_summary.md
+      size: 3263
diff --git a/dvc.yaml b/dvc.yaml
index a351a8e..4c286f7 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -113,7 +113,7 @@ stages:
       - data/experiments/fine-tune/${item.model}/${item.dataset}/
 
   ### Prediction ###
-  predict:
+  predict_pl:
     matrix:
       dataset:
         - pl-court-instruct
@@ -129,6 +129,7 @@ stages:
         - trurl-13B-academic
         - qra-13b
         - Bielik-11B-v2.2-Instruct
+        - Bielik-11B-v2.2-Instruct-fine-tuned
       seed: ${seeds}
     cmd: >-
       PYTHONPATH=. python scripts/sft/predict.py
@@ -189,7 +190,7 @@ stages:
       - data/experiments/predict/${item.dataset}/open_ai_${item.model}/outputs_${item.seed}.json
 
   ### Evaluation ###
-  evaluate:
+  evaluate_pl:
     matrix:
       dataset:
         - pl-court-instruct
@@ -204,6 +205,8 @@ stages:
         - Bielik-7B-Instruct-v0.1-fine-tuned
         - trurl-13B-academic
         - qra-13b
+        - Bielik-11B-v2.2-Instruct
+        - Bielik-11B-v2.2-Instruct-fine-tuned
       seed: ${seeds}
     cmd: >-
       PYTHONPATH=. python scripts/sft/evaluate.py
@@ -265,6 +268,8 @@ stages:
         - Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned
         - Bielik-7B-Instruct-v0.1
         - Bielik-7B-Instruct-v0.1-fine-tuned
+        - Bielik-11B-v2.2-Instruct
+        - Bielik-11B-v2.2-Instruct-fine-tuned
       seed: ${seeds}
     cmd: >-
       PYTHONPATH=. python scripts/sft/evaluate_llm_as_judge.py