Add results for gpt-4o and gpt-4o-mini

pwr-ai · Aug 25, 2024 · 28b2e77 · 28b2e77
1 parent 2aab5d0
commit 28b2e77
Show file tree

Hide file tree

Showing 5 changed files with 87 additions and 1 deletion.
diff --git a/data/experiments/predict/pl-court-instruct/metrics_ngram_summary.md b/data/experiments/predict/pl-court-instruct/metrics_ngram_summary.md
@@ -1,8 +1,10 @@
 | llm                                           | full_text_chrf   | court_name      | date            | department_name   | judges          | legal_bases     | recorder        | signature       |
 |:----------------------------------------------|:-----------------|:----------------|:----------------|:------------------|:----------------|:----------------|:----------------|:----------------|
-| Unsloth-Llama-3-8B-Instruct                   | 0.578 (± 0.000)  | 0.865 (± 0.000) | 0.947 (± 0.001) | 0.889 (± 0.032)   | 0.905 (± 0.014) | 0.323 (± 0.052) | 0.741 (± 0.002) | 0.672 (± 0.026) |
+| Unsloth-Llama-3-8B-Instruct                   | 0.579 (± 0.001)  | 0.865 (± 0.000) | 0.948 (± 0.001) | 0.882 (± 0.026)   | 0.902 (± 0.011) | 0.312 (± 0.042) | 0.741 (± 0.002) | 0.665 (± 0.022) |
 | Unsloth-Llama-3-8B-Instruct-fine-tuned        | 0.747 (± 0.000)  | 0.916 (± 0.001) | 0.920 (± 0.002) | 0.902 (± 0.000)   | 0.906 (± 0.001) | 0.442 (± 0.001) | 0.812 (± 0.003) | 0.805 (± 0.004) |
 | Unsloth-Mistral-7B-Instruct-v0.3              | 0.574 (± 0.001)  | 0.397 (± 0.005) | 0.470 (± 0.004) | 0.404 (± 0.005)   | 0.424 (± 0.003) | 0.159 (± 0.002) | 0.436 (± 0.003) | 0.159 (± 0.001) |
 | Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned   | 0.634 (± 0.001)  | 0.547 (± 0.003) | 0.549 (± 0.003) | 0.543 (± 0.003)   | 0.544 (± 0.003) | 0.366 (± 0.002) | 0.534 (± 0.002) | 0.533 (± 0.001) |
 | Unsloth-Mistral-Nemo-Instruct-2407            | 0.520 (± 0.001)  | 0.732 (± 0.006) | 0.759 (± 0.005) | 0.687 (± 0.006)   | 0.619 (± 0.006) | 0.267 (± 0.002) | 0.690 (± 0.008) | 0.600 (± 0.004) |
 | Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned | 0.610 (± 0.000)  | 0.782 (± 0.002) | 0.742 (± 0.000) | 0.717 (± 0.002)   | 0.717 (± 0.001) | 0.368 (± 0.000) | 0.696 (± 0.003) | 0.650 (± 0.003) |
+| open_ai_gpt-4o                                | 0.651 (± nan)    | 0.955 (± nan)   | 0.986 (± nan)   | 0.971 (± nan)     | 0.917 (± nan)   | 0.502 (± nan)   | 0.834 (± nan)   | 0.990 (± nan)   |
+| open_ai_gpt-4o-mini                           | 0.646 (± nan)    | 0.953 (± nan)   | 0.986 (± nan)   | 0.976 (± nan)     | 0.927 (± nan)   | 0.534 (± nan)   | 0.969 (± nan)   | 0.988 (± nan)   |
diff --git a/data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/.gitignore b/data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/.gitignore
@@ -1 +1,2 @@
 /outputs_997.json
+/metrics_997.json
diff --git a/data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/.gitignore b/data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/.gitignore
@@ -0,0 +1,2 @@
+/outputs_997.json
+/metrics_997.json
diff --git a/dvc.lock b/dvc.lock
@@ -1561,3 +1561,81 @@ stages:
       hash: md5
       md5: 839c911f542cd7c60c9ae52ef95e9907
       size: 1812429
+  evaluate@open_ai_gpt-4o-mini-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
+    deps:
+    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
+      hash: md5
+      md5: 839c911f542cd7c60c9ae52ef95e9907
+      size: 1812429
+    - path: scripts/sft/evaluate.py
+      hash: md5
+      md5: 73aa4a7eb8a035c087702457b9401654
+      size: 636
+    outs:
+    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o-mini/metrics_997.json
+      hash: md5
+      md5: fe43f0d25b500a0f2fb2d8199b8034fd
+      size: 305
+  predict@Bielik-7B-Instruct-v0.1-42:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Bielik-7B-Instruct-v0.1
+      random_seed=42
+      output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
+    deps:
+    - path: configs/model/Bielik-7B-Instruct-v0.1.yaml
+      hash: md5
+      md5: c3412525e9819b53fbad06363a07a871
+      size: 173
+    - path: configs/predict.yaml
+      hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      size: 402
+    - path: scripts/sft/predict.py
+      hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      size: 3198
+    outs:
+    - path:
+        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_42.json
+      hash: md5
+      md5: 2dc39513a04910c5d0c54380166639d9
+      size: 2029644
+  predict@Bielik-7B-Instruct-v0.1-7312:
+    cmd: PYTHONPATH=. python scripts/sft/predict.py model=Bielik-7B-Instruct-v0.1
+      random_seed=7312
+      output_file=data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
+    deps:
+    - path: configs/model/Bielik-7B-Instruct-v0.1.yaml
+      hash: md5
+      md5: c3412525e9819b53fbad06363a07a871
+      size: 173
+    - path: configs/predict.yaml
+      hash: md5
+      md5: 5fc8b9ac571d4a2209d7d866697252ab
+      size: 402
+    - path: scripts/sft/predict.py
+      hash: md5
+      md5: f9acd63cd4d682ae2242d7b51f0d974b
+      size: 3198
+    outs:
+    - path:
+        data/experiments/predict/pl-court-instruct/Bielik-7B-Instruct-v0.1/outputs_7312.json
+      hash: md5
+      md5: ae39bf31296ffe82c0f6a3e8c9ff63aa
+      size: 2014399
+  evaluate@open_ai_gpt-4o-997:
+    cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
+    deps:
+    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/outputs_997.json
+      hash: md5
+      md5: 7c5833fdd1419163b286baaa3d71e084
+      size: 1965252
+    - path: scripts/sft/evaluate.py
+      hash: md5
+      md5: 73aa4a7eb8a035c087702457b9401654
+      size: 636
+    outs:
+    - path: data/experiments/predict/pl-court-instruct/open_ai_gpt-4o/metrics_997.json
+      hash: md5
+      md5: 65c808d4aebd8efe37b94a5128a19de6
+      size: 306
diff --git a/dvc.yaml b/dvc.yaml
@@ -123,6 +123,7 @@ stages:
   predict_with_api:
     matrix:
       model:
+        - gpt-4o
         - gpt-4o-mini
       seed:
         - 997
@@ -146,6 +147,8 @@ stages:
         - Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned
         - Unsloth-Mistral-Nemo-Instruct-2407
         - Unsloth-Mistral-Nemo-Instruct-2407-fine-tuned
+        - open_ai_gpt-4o
+        - open_ai_gpt-4o-mini
       seed: ${seeds}
     cmd: >-
       PYTHONPATH=. python scripts/sft/evaluate.py