From 7d242381c0aeca89a2bea94c4c849ddd2a4bec35 Mon Sep 17 00:00:00 2001 From: Baber Abbasi <92168766+baberabb@users.noreply.github.com> Date: Thu, 26 Sep 2024 23:57:39 +0500 Subject: [PATCH] change glianorex to test split (#2332) * change glianorex to test set * nit * fix test; doc_to_target can be str for multiple_choice * nit --- lm_eval/tasks/glianorex/README.md | 5 +++++ lm_eval/tasks/glianorex/glianorex.yaml | 4 +++- lm_eval/tasks/glianorex/glianorex_en.yaml | 4 +++- lm_eval/tasks/glianorex/glianorex_fr.yaml | 4 +++- lm_eval/tasks/glianorex/preprocess_glianorex.py | 3 ++- tests/test_tasks.py | 6 +++++- 6 files changed, 21 insertions(+), 5 deletions(-) diff --git a/lm_eval/tasks/glianorex/README.md b/lm_eval/tasks/glianorex/README.md index 3efc925665..cff102a897 100644 --- a/lm_eval/tasks/glianorex/README.md +++ b/lm_eval/tasks/glianorex/README.md @@ -18,3 +18,8 @@ All tasks are multiple choice questions with 4 options, only one correct option. - `glianorex_en`: Evaluates the accuracy on 264 questions in English. - `glianorex_fr`: Evaluates the accuracy on 264 questions in French. + +#### Change Log + +* (all tasks) 2024-09-23 -- 1.0 + * Switched the `test_split` from `train` to `test`. diff --git a/lm_eval/tasks/glianorex/glianorex.yaml b/lm_eval/tasks/glianorex/glianorex.yaml index a7ba436656..b1fdb23689 100755 --- a/lm_eval/tasks/glianorex/glianorex.yaml +++ b/lm_eval/tasks/glianorex/glianorex.yaml @@ -1,7 +1,7 @@ task: glianorex dataset_path: maximegmd/glianorex output_type: multiple_choice -test_split: train +test_split: test doc_to_text: !function preprocess_glianorex.doc_to_text doc_to_target: !function preprocess_glianorex.doc_to_target doc_to_choice: [ 'A', 'B', 'C', 'D' ] @@ -12,3 +12,5 @@ metric_list: - metric: acc_norm aggregation: mean higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/glianorex/glianorex_en.yaml b/lm_eval/tasks/glianorex/glianorex_en.yaml index b08c6f8114..d1be3d18cf 100755 --- a/lm_eval/tasks/glianorex/glianorex_en.yaml +++ b/lm_eval/tasks/glianorex/glianorex_en.yaml @@ -1,7 +1,7 @@ task: glianorex_en dataset_path: maximegmd/glianorex output_type: multiple_choice -test_split: train +test_split: test doc_to_text: !function preprocess_glianorex.doc_to_text doc_to_target: !function preprocess_glianorex.doc_to_target process_docs: !function preprocess_glianorex.filter_english @@ -13,3 +13,5 @@ metric_list: - metric: acc_norm aggregation: mean higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/glianorex/glianorex_fr.yaml b/lm_eval/tasks/glianorex/glianorex_fr.yaml index 6d09bc5a7b..8a28943092 100755 --- a/lm_eval/tasks/glianorex/glianorex_fr.yaml +++ b/lm_eval/tasks/glianorex/glianorex_fr.yaml @@ -1,7 +1,7 @@ task: glianorex_fr dataset_path: maximegmd/glianorex output_type: multiple_choice -test_split: train +test_split: test doc_to_text: !function preprocess_glianorex.doc_to_text doc_to_target: !function preprocess_glianorex.doc_to_target process_docs: !function preprocess_glianorex.filter_french @@ -13,3 +13,5 @@ metric_list: - metric: acc_norm aggregation: mean higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/glianorex/preprocess_glianorex.py b/lm_eval/tasks/glianorex/preprocess_glianorex.py index f257df14d8..9a70dfd5a7 100755 --- a/lm_eval/tasks/glianorex/preprocess_glianorex.py +++ b/lm_eval/tasks/glianorex/preprocess_glianorex.py @@ -7,7 +7,8 @@ def doc_to_text(doc) -> str: return f"Question: {doc['question']}\n{answers}Answer:" -def doc_to_target(doc) -> int: +def doc_to_target(doc) -> str: + # answer_idx is `A`, `B`, `C`, `D` etc. return doc["answer_idx"] diff --git a/tests/test_tasks.py b/tests/test_tasks.py index c9453d5497..0dc514a2cf 100644 --- a/tests/test_tasks.py +++ b/tests/test_tasks.py @@ -101,7 +101,11 @@ def test_doc_to_target(self, task_class, limit): ) _array_target = [task.doc_to_target(doc) for doc in arr] if task._config.output_type == "multiple_choice": - assert all(isinstance(label, int) for label in _array_target) + # TODO: label can be string or int; add better test conditions + assert all( + (isinstance(label, int) or isinstance(label, str)) + for label in _array_target + ) def test_build_all_requests(self, task_class, limit): task_class.build_all_requests(rank=1, limit=limit, world_size=1)