diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
index 28ed66b21b..c6c1c6a19d 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
@@ -26,7 +26,9 @@ task:
       - metric: acc
         weight_by_size: True
 aggregate_metric_list:
-  - metric: acc
+  - aggregation: mean
+    metric: exact_match
     weight_by_size: True
+    filter_list: get-answer
 metadata:
   version: 2
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
index be1ede3f69..cfbf222e5b 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
@@ -1,12 +1,11 @@
 dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
 validation_split: validation
 test_split: test
-fewshot_split: dev
 fewshot_config:
   sampler: first_n
 output_type: generate_until
-doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
-doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
+doc_to_text: "{% if choices is defined%}Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step.{% else %}Q: {{ question.strip() }}\nA:{% endif %}"
+doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer] if answer is defined else target}}"
 filter_list:
   - name: "get-answer"
     filter:
@@ -18,7 +17,7 @@ generation_kwargs:
     - ""
   do_sample: false
   temperature: 0.0
-num_fewshot: 0
+num_fewshot: 4
 metric_list:
   - metric: exact_match
     aggregation: mean
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
index 76944383ba..a38a06969e 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
@@ -12,7 +12,7 @@ filter_list:
       - function: "take_first"
   - name: "flexible-extract"
     filter:
-      - function: !function utils.MultiChoiceRegexFilter
+      - function: "multi_choice_regex"
         group_select: 0
         regex_pattern: "(\\([A-Z]\\))"
         ignore_case: true