From 7d242381c0aeca89a2bea94c4c849ddd2a4bec35 Mon Sep 17 00:00:00 2001
From: Baber Abbasi <92168766+baberabb@users.noreply.github.com>
Date: Thu, 26 Sep 2024 23:57:39 +0500
Subject: [PATCH] change glianorex to test split (#2332)

* change glianorex to test set

* nit

* fix test; doc_to_target can be str for multiple_choice

* nit
---
 lm_eval/tasks/glianorex/README.md               | 5 +++++
 lm_eval/tasks/glianorex/glianorex.yaml          | 4 +++-
 lm_eval/tasks/glianorex/glianorex_en.yaml       | 4 +++-
 lm_eval/tasks/glianorex/glianorex_fr.yaml       | 4 +++-
 lm_eval/tasks/glianorex/preprocess_glianorex.py | 3 ++-
 tests/test_tasks.py                             | 6 +++++-
 6 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/lm_eval/tasks/glianorex/README.md b/lm_eval/tasks/glianorex/README.md
index 3efc925665..cff102a897 100644
--- a/lm_eval/tasks/glianorex/README.md
+++ b/lm_eval/tasks/glianorex/README.md
@@ -18,3 +18,8 @@ All tasks are multiple choice questions with 4 options, only one correct option.
 
 - `glianorex_en`: Evaluates the accuracy on 264 questions in English.
 - `glianorex_fr`: Evaluates the accuracy on 264 questions in French.
+
+#### Change Log
+
+* (all tasks) 2024-09-23 -- 1.0
+  * Switched the `test_split` from `train` to `test`.
diff --git a/lm_eval/tasks/glianorex/glianorex.yaml b/lm_eval/tasks/glianorex/glianorex.yaml
index a7ba436656..b1fdb23689 100755
--- a/lm_eval/tasks/glianorex/glianorex.yaml
+++ b/lm_eval/tasks/glianorex/glianorex.yaml
@@ -1,7 +1,7 @@
 task: glianorex
 dataset_path: maximegmd/glianorex
 output_type: multiple_choice
-test_split: train
+test_split: test
 doc_to_text: !function preprocess_glianorex.doc_to_text
 doc_to_target: !function preprocess_glianorex.doc_to_target
 doc_to_choice: [ 'A', 'B', 'C', 'D' ]
@@ -12,3 +12,5 @@ metric_list:
   - metric: acc_norm
     aggregation: mean
     higher_is_better: true
+metadata:
+  version: 1.0
diff --git a/lm_eval/tasks/glianorex/glianorex_en.yaml b/lm_eval/tasks/glianorex/glianorex_en.yaml
index b08c6f8114..d1be3d18cf 100755
--- a/lm_eval/tasks/glianorex/glianorex_en.yaml
+++ b/lm_eval/tasks/glianorex/glianorex_en.yaml
@@ -1,7 +1,7 @@
 task: glianorex_en
 dataset_path: maximegmd/glianorex
 output_type: multiple_choice
-test_split: train
+test_split: test
 doc_to_text: !function preprocess_glianorex.doc_to_text
 doc_to_target: !function preprocess_glianorex.doc_to_target
 process_docs: !function preprocess_glianorex.filter_english
@@ -13,3 +13,5 @@ metric_list:
   - metric: acc_norm
     aggregation: mean
     higher_is_better: true
+metadata:
+  version: 1.0
diff --git a/lm_eval/tasks/glianorex/glianorex_fr.yaml b/lm_eval/tasks/glianorex/glianorex_fr.yaml
index 6d09bc5a7b..8a28943092 100755
--- a/lm_eval/tasks/glianorex/glianorex_fr.yaml
+++ b/lm_eval/tasks/glianorex/glianorex_fr.yaml
@@ -1,7 +1,7 @@
 task: glianorex_fr
 dataset_path: maximegmd/glianorex
 output_type: multiple_choice
-test_split: train
+test_split: test
 doc_to_text: !function preprocess_glianorex.doc_to_text
 doc_to_target: !function preprocess_glianorex.doc_to_target
 process_docs: !function preprocess_glianorex.filter_french
@@ -13,3 +13,5 @@ metric_list:
   - metric: acc_norm
     aggregation: mean
     higher_is_better: true
+metadata:
+  version: 1.0
diff --git a/lm_eval/tasks/glianorex/preprocess_glianorex.py b/lm_eval/tasks/glianorex/preprocess_glianorex.py
index f257df14d8..9a70dfd5a7 100755
--- a/lm_eval/tasks/glianorex/preprocess_glianorex.py
+++ b/lm_eval/tasks/glianorex/preprocess_glianorex.py
@@ -7,7 +7,8 @@ def doc_to_text(doc) -> str:
     return f"Question: {doc['question']}\n{answers}Answer:"
 
 
-def doc_to_target(doc) -> int:
+def doc_to_target(doc) -> str:
+    # answer_idx is the letter label of the correct option ("A", "B", "C" or "D"), hence a str.
     return doc["answer_idx"]
 
 
diff --git a/tests/test_tasks.py b/tests/test_tasks.py
index c9453d5497..0dc514a2cf 100644
--- a/tests/test_tasks.py
+++ b/tests/test_tasks.py
@@ -101,7 +101,11 @@ def test_doc_to_target(self, task_class, limit):
         )
         _array_target = [task.doc_to_target(doc) for doc in arr]
         if task._config.output_type == "multiple_choice":
-            assert all(isinstance(label, int) for label in _array_target)
+            # TODO(baber): a label may be either a str or an int; add stricter test conditions
+            assert all(
+                (isinstance(label, int) or isinstance(label, str))
+                for label in _array_target
+            )
 
     def test_build_all_requests(self, task_class, limit):
         task_class.build_all_requests(rank=1, limit=limit, world_size=1)