From 087139a4a3f29af4fbb3d6081a2c50c1ed913548 Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Fri, 19 Jul 2024 14:20:00 +0200 Subject: [PATCH 1/5] Add galician_bench --- lm_eval/tasks/galician_bench/README.md | 80 ++++++ .../galician_bench/belebele_glg_Latn.yaml | 9 + .../flores_gl/_flores_common_yaml | 26 ++ .../flores_gl/create-yamls_flores_gl.py | 115 ++++++++ .../flores_gl/flores_ca-gl.yaml | 8 + .../flores_gl/flores_de-gl.yaml | 8 + .../flores_gl/flores_en-gl.yaml | 8 + .../flores_gl/flores_es-gl.yaml | 8 + .../flores_gl/flores_eu-gl.yaml | 8 + .../flores_gl/flores_fr-gl.yaml | 8 + .../flores_gl/flores_gl-ca.yaml | 8 + .../flores_gl/flores_gl-de.yaml | 8 + .../flores_gl/flores_gl-en.yaml | 8 + .../flores_gl/flores_gl-es.yaml | 8 + .../flores_gl/flores_gl-eu.yaml | 8 + .../flores_gl/flores_gl-fr.yaml | 8 + .../flores_gl/flores_gl-it.yaml | 8 + .../flores_gl/flores_gl-pt.yaml | 8 + .../flores_gl/flores_it-gl.yaml | 8 + .../flores_gl/flores_pt-gl.yaml | 8 + lm_eval/tasks/galician_bench/galcola.yaml | 16 ++ .../tasks/galician_bench/galician_bench.yaml | 15 + .../tasks/galician_bench/mgsm_direct_gl.yaml | 27 ++ .../tasks/galician_bench/openbookqa_gl.yaml | 21 ++ .../tasks/galician_bench/parafrases_gl.yaml | 18 ++ lm_eval/tasks/galician_bench/paws_gl.yaml | 20 ++ .../galician_bench/summarization_gl.yaml | 21 ++ .../galician_bench/truthfulqa_gl_gen.yaml | 70 +++++ .../galician_bench/truthfulqa_gl_mc1.yaml | 36 +++ .../galician_bench/truthfulqa_gl_mc2.yaml | 15 + lm_eval/tasks/galician_bench/utils.py | 262 ++++++++++++++++++ lm_eval/tasks/galician_bench/xnli_gl.yaml | 22 ++ .../tasks/galician_bench/xstorycloze_gl.yaml | 16 ++ 33 files changed, 917 insertions(+) create mode 100644 lm_eval/tasks/galician_bench/README.md create mode 100644 lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py create mode 100644 
lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/galcola.yaml create mode 100644 lm_eval/tasks/galician_bench/galician_bench.yaml create mode 100644 lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml create mode 100644 lm_eval/tasks/galician_bench/openbookqa_gl.yaml create mode 100644 lm_eval/tasks/galician_bench/parafrases_gl.yaml create mode 100644 lm_eval/tasks/galician_bench/paws_gl.yaml create mode 100644 lm_eval/tasks/galician_bench/summarization_gl.yaml create mode 100644 lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml create mode 100644 lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml create mode 100644 lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml create mode 100644 lm_eval/tasks/galician_bench/utils.py create mode 100644 lm_eval/tasks/galician_bench/xnli_gl.yaml create 
mode 100644 lm_eval/tasks/galician_bench/xstorycloze_gl.yaml diff --git a/lm_eval/tasks/galician_bench/README.md b/lm_eval/tasks/galician_bench/README.md new file mode 100644 index 0000000000..24dd7a7435 --- /dev/null +++ b/lm_eval/tasks/galician_bench/README.md @@ -0,0 +1,80 @@ +# GalicianBench + +### Paper + +GalicianBench is a benchmark for evaluating language models in Galician tasks. That is, it evaluates the ability of a language model to understand and generate Galician text. GalicianBench offers a combination of pre-existing, open datasets and datasets developed exclusively for this benchmark. All the details of GalicianBench will be published in a paper soon. + +The new evaluation datasets included in GalicianBench are: +| Task | Category | Homepage | +|:-------------:|:-----:|:-----:| +| Belebele_gl | Reading Comprehension | https://huggingface.co/datasets/proxectonos/belebele_gl | +| GalCoLA | Linguistic Acceptability | https://huggingface.co/datasets/proxectonos/galcola | +| MGSM_gl | Math | https://huggingface.co/datasets/proxectonos/mgsm_gl | +| Parafrases_gl | Paraphrasing | https://huggingface.co/datasets/proxectonos/parafrases_gl | +| PAWS-gl | Paraphrasing | https://huggingface.co/datasets/proxectonos/PAWS-gl | +| OpenBookQA_gl | Question Answering | https://huggingface.co/datasets/proxectonos/openbookqa_gl | +| Summarization_gl | Summarization | https://huggingface.co/datasets/proxectonos/summarization_gl | +| TruthfulQA_gl | Truthfulness | https://huggingface.co/datasets/proxectonos/truthfulqa_gl | +| xnli_gl | NLI | https://huggingface.co/datasets/proxectonos/xnli_gl | +| xstorycloze_gl | Commonsense Reasoning | https://huggingface.co/datasets/proxectonos/xstorycloze_gl | + +The datasets included in GalicianBench that have been made public in previous publications are: + +| Task | Category | Paper title | Homepage | +|:-------------:|:-----:|:-------------:|:-----:| +| FLORES_gl | Translation | [The FLORES-101 Evaluation Benchmark for 
Low-Resource and Multilingual Machine Translation](https://arxiv.org/abs/2106.03193) | https://huggingface.co/datasets/facebook/flores | + + +### Citation +Paper for GalicianBench coming soon. + +### Groups and Tasks + +#### Groups + +- `galician_bench`: All tasks included in GalicianBench. +- `flores_gl`: All FLORES translation tasks from or to Galician. + + +#### Tasks + +The following tasks evaluate tasks on GalicianBench dataset using various scoring methods. + - `belebele_glg_Latn` + - `flores_gl` + - `flores_gl-ca` + - `flores_gl-de` + - `flores_gl-en` + - `flores_gl-es` + - `flores_gl-eu` + - `flores_gl-fr` + - `flores_gl-it` + - `flores_gl-pt` + - `flores_ca-gl` + - `flores_de-gl` + - `flores_en-gl` + - `flores_es-gl` + - `flores_eu-gl` + - `flores_fr-gl` + - `flores_it-gl` + - `flores_pt-gl` + - `galcola` + - `summarization_gl` + - `parafrases_gl` + - `paws_gl` + - `openbookqa_gl` + - `mgsm_direct_gl` + - `truthfulqa_gl` + - `xnli_gl` + - `xstorycloze_gl` + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? + * [ ] Yes, original implementation contributed by author of the benchmark + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml b/lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml new file mode 100644 index 0000000000..80aea572d7 --- /dev/null +++ b/lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml @@ -0,0 +1,9 @@ +group: + - belebele +task: belebele_glg_Latn +include: ../belebele/_default_template_yaml +dataset_path: proxectonos/belebele_gl +fewshot_split: train +test_split: train +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml b/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml new file mode 100644 index 0000000000..e7b7ea6bac --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml @@ -0,0 +1,26 @@ +group: flores +dataset_path: facebook/flores +dataset_name: all +output_type: generate_until +#! The test split of flores is not publicly available! (See paper section 6.1) +#! We are using `dev` and `devtest` splits, but they're mapped to train/validation/test in `data/flores/flores.py`. +training_split: dev +validation_split: dev +test_split: devtest +fewshot_split: dev +target_delimiter: '' +generation_kwargs: + until: + - "\n" +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: ter + aggregation: ter + higher_is_better: false + - metric: chrf + aggregation: chrf + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py b/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py new file mode 100644 index 0000000000..7d805de62a --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py @@ -0,0 +1,115 @@ +""" +Script to generate task YAMLs for the FLORES-200 dataset. +Based on `tasks/translation/utils.py`. 
+""" + +import argparse +import yaml +from langcodes import * +from itertools import * + +# utils +flatten = lambda l: list(itertools.chain(*l)) + +# constants +_LANGUAGES = [ +"ace_Arab", "bam_Latn", "dzo_Tibt", "hin_Deva", "khm_Khmr", "mag_Deva", "pap_Latn", "sot_Latn", "tur_Latn", +"ace_Latn", "ban_Latn", "ell_Grek", "hne_Deva", "kik_Latn", "mai_Deva", "pbt_Arab", "spa_Latn", "twi_Latn", +"acm_Arab", "bel_Cyrl", "eng_Latn", "hrv_Latn", "kin_Latn", "mal_Mlym", "pes_Arab", "srd_Latn", "tzm_Tfng", +"acq_Arab", "bem_Latn", "epo_Latn", "hun_Latn", "kir_Cyrl", "mar_Deva", "plt_Latn", "srp_Cyrl", "uig_Arab", +"aeb_Arab", "ben_Beng", "est_Latn", "hye_Armn", "kmb_Latn", "min_Arab", "pol_Latn", "ssw_Latn", "ukr_Cyrl", +"afr_Latn", "bho_Deva", "eus_Latn", "ibo_Latn", "kmr_Latn", "min_Latn", "por_Latn", "sun_Latn", "umb_Latn", +"ajp_Arab", "bjn_Arab", "ewe_Latn", "ilo_Latn", "knc_Arab", "mkd_Cyrl", "prs_Arab", "swe_Latn", "urd_Arab", +"aka_Latn", "bjn_Latn", "fao_Latn", "ind_Latn", "knc_Latn", "mlt_Latn", "quy_Latn", "swh_Latn", "uzn_Latn", +"als_Latn", "bod_Tibt", "fij_Latn", "isl_Latn", "kon_Latn", "mni_Beng", "ron_Latn", "szl_Latn", "vec_Latn", +"amh_Ethi", "bos_Latn", "fin_Latn", "ita_Latn", "kor_Hang", "mos_Latn", "run_Latn", "tam_Taml", "vie_Latn", +"apc_Arab", "bug_Latn", "fon_Latn", "jav_Latn", "lao_Laoo", "mri_Latn", "rus_Cyrl", "taq_Latn", "war_Latn", +"arb_Arab", "bul_Cyrl", "fra_Latn", "jpn_Jpan", "lij_Latn", "mya_Mymr", "sag_Latn", "taq_Tfng", "wol_Latn", +"arb_Latn", "cat_Latn", "fur_Latn", "kab_Latn", "lim_Latn", "nld_Latn", "san_Deva", "tat_Cyrl", "xho_Latn", +"ars_Arab", "ceb_Latn", "fuv_Latn", "kac_Latn", "lin_Latn", "nno_Latn", "sat_Olck", "tel_Telu", "ydd_Hebr", +"ary_Arab", "ces_Latn", "gaz_Latn", "kam_Latn", "lit_Latn", "nob_Latn", "scn_Latn", "tgk_Cyrl", "yor_Latn", +"arz_Arab", "cjk_Latn", "gla_Latn", "kan_Knda", "lmo_Latn", "npi_Deva", "shn_Mymr", "tgl_Latn", "yue_Hant", +"asm_Beng", "ckb_Arab", "gle_Latn", "kas_Arab", "ltg_Latn", "nso_Latn", 
"sin_Sinh", "tha_Thai", "zho_Hans", +"ast_Latn", "crh_Latn", "glg_Latn", "kas_Deva", "ltz_Latn", "nus_Latn", "slk_Latn", "tir_Ethi", "zho_Hant", +"awa_Deva", "cym_Latn", "grn_Latn", "kat_Geor", "lua_Latn", "nya_Latn", "slv_Latn", "tpi_Latn", "zsm_Latn", +"ayr_Latn", "dan_Latn", "guj_Gujr", "kaz_Cyrl", "lug_Latn", "oci_Latn", "smo_Latn", "tsn_Latn", "zul_Latn", +"azb_Arab", "deu_Latn", "hat_Latn", "kbp_Latn", "luo_Latn", "ory_Orya", "sna_Latn", "tso_Latn", +"azj_Latn", "dik_Latn", "hau_Latn", "kea_Latn", "lus_Latn", "pag_Latn", "snd_Arab", "tuk_Latn", +"bak_Cyrl", "dyu_Latn", "heb_Hebr", "khk_Cyrl", "lvs_Latn", "pan_Guru", "som_Latn", "tum_Latn" +] +LANGUAGE_PAIRS = [(a, b) for idx, a in enumerate(_LANGUAGES) for b in _LANGUAGES[idx + 1:]] + +LANGUAGES_OF_INTEREST = ["cat_Latn", "spa_Latn", "eng_Latn", "glg_Latn", "eus_Latn", "ita_Latn", "deu_Latn", "por_Latn", "fra_Latn"] +MAIN_LANG = "glg_Latn" +LANGUAGE_PAIRS = [(a, b) for (a, b) in LANGUAGE_PAIRS if a in LANGUAGES_OF_INTEREST and b in LANGUAGES_OF_INTEREST and MAIN_LANG in (a, b)] + +# auxiliary functions + +code_to_language_name = lambda code: Language.make(language=Language.get(code)["language"]).display_name() +code_to_short_name = lambda code: Language.get(code)["language"] +jinja_var = lambda s: "{{" + s + "}}" # wrapper to avoid having to escape { } in format strings + +def doc_to_text(src: str, tgt: str) -> str: + src_name, tgt_name = map(code_to_language_name, [src, tgt]) + + return f"""\ +{src_name} sentence: {jinja_var('sentence_' + src)} +{tgt_name} sentence:""" + +def doc_to_target(tgt: str) -> str: + + return f"{jinja_var('sentence_' + tgt)}" + +# main function + +def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: + """ + Generate a YAML file for each translation direction. 
+ """ + + err = [] + for src, tgt in LANGUAGE_PAIRS: + + # do both translation directions for each lang pair + for src, tgt in [(src, tgt), (tgt, src)]: + lang_pair_name = f"{code_to_short_name(src)}-{code_to_short_name(tgt)}" + yaml_file_name = f"flores_{lang_pair_name}.yaml" + + try: + with open( f"{output_dir}/{yaml_file_name}", "w" if overwrite else "x", encoding="utf-8") as outfile: + print(f"Creating {yaml_file_name}...") + outfile.write("# File generated by `create-yamls.py`\n") + yaml.dump( + { +# "group": [f"{BENCH_NAME}_bench", f"{BENCH_NAME}_bench_flores"], + "group": "flores_gl", + "include": "_flores_common_yaml", + "task": f"flores_{lang_pair_name}", + "doc_to_text": doc_to_text(src, tgt), + "doc_to_target": doc_to_target(tgt), + }, + outfile, + sort_keys=False, + ) + + except FileExistsError: + err.append(yaml_file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist:" + f" {', '.join(err)}" + "\nUse flag --overwrite to overwrite them." 
+ ) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--overwrite", default=False, action="store_true", help="Overwrite files if they already exist") + parser.add_argument( "--output-dir", default=".", help="Directory to write yaml files to" ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite) + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml new file mode 100644 index 0000000000..fc237527d1 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_ca-gl +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml new file mode 100644 index 0000000000..4217ff625e --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_de-gl +doc_to_text: 'German sentence: {{sentence_deu_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml new file mode 100644 index 0000000000..f0cff50781 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_en-gl +doc_to_text: 'English sentence: {{sentence_eng_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml 
b/lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml new file mode 100644 index 0000000000..b41b13889f --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_es-gl +doc_to_text: 'Spanish sentence: {{sentence_spa_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml new file mode 100644 index 0000000000..d4963e73e0 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_eu-gl +doc_to_text: 'Basque sentence: {{sentence_eus_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml new file mode 100644 index 0000000000..61d810196a --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_fr-gl +doc_to_text: 'French sentence: {{sentence_fra_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml new file mode 100644 index 0000000000..e17f7748d8 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-ca +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml 
b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml new file mode 100644 index 0000000000..8d396c1c3a --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-de +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + German sentence:' +doc_to_target: '{{sentence_deu_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml new file mode 100644 index 0000000000..679de2e9ef --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-en +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + English sentence:' +doc_to_target: '{{sentence_eng_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml new file mode 100644 index 0000000000..8192757472 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-es +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + Spanish sentence:' +doc_to_target: '{{sentence_spa_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml new file mode 100644 index 0000000000..0c3f7563ab --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-eu +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + Basque sentence:' +doc_to_target: '{{sentence_eus_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml 
b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml new file mode 100644 index 0000000000..99ef11bda9 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-fr +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + French sentence:' +doc_to_target: '{{sentence_fra_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml new file mode 100644 index 0000000000..ce7203fd06 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-it +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + Italian sentence:' +doc_to_target: '{{sentence_ita_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml new file mode 100644 index 0000000000..5d80286f87 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-pt +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + Portuguese sentence:' +doc_to_target: '{{sentence_por_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml new file mode 100644 index 0000000000..bc69cc77ea --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_it-gl +doc_to_text: 'Italian sentence: {{sentence_ita_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml 
b/lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml new file mode 100644 index 0000000000..4dea2e0e79 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_pt-gl +doc_to_text: 'Portuguese sentence: {{sentence_por_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/galcola.yaml b/lm_eval/tasks/galician_bench/galcola.yaml new file mode 100644 index 0000000000..e53d3d601e --- /dev/null +++ b/lm_eval/tasks/galician_bench/galcola.yaml @@ -0,0 +1,16 @@ +task: galcola +dataset_path: proxectonos/galcola +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +doc_to_text: "{{sentence}}\nPregunta: Ten sentido esta frase?\nResposta:" +doc_to_target: label +doc_to_choice: ["non", "si"] +should_decontaminate: true +doc_to_decontamination_query: sentence +metric_list: + - metric: mcc + - metric: acc +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/galician_bench.yaml b/lm_eval/tasks/galician_bench/galician_bench.yaml new file mode 100644 index 0000000000..3624517a97 --- /dev/null +++ b/lm_eval/tasks/galician_bench/galician_bench.yaml @@ -0,0 +1,15 @@ +group: galician_bench +task: + - belebele_glg_Latn + - flores_gl + - galcola + - summarization_gl + - parafrases_gl + - paws_gl + - openbookqa_gl + - mgsm_direct_gl + - truthfulqa_gl + - xnli_gl + - xstorycloze_gl +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml b/lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml new file mode 100644 index 0000000000..d9d8ca3bb1 --- /dev/null +++ b/lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml @@ -0,0 +1,27 @@ +group: + - mgsm_direct +task: mgsm_direct_gl +dataset_path: proxectonos/mgsm_gl +doc_to_target: '{{answer_number|string}}' +doc_to_text: '{% if answer != None %}{{question + 
"\nResposta: "}}{% else %}{{"Pregunta: " + question + "\nResposta: "}}{% endif %}' +output_type: generate_until +training_split: train +test_split: test +target_delimiter: "" +generation_kwargs: + until: + - "\n\n" + - "\n" +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/openbookqa_gl.yaml b/lm_eval/tasks/galician_bench/openbookqa_gl.yaml new file mode 100644 index 0000000000..d11a048c77 --- /dev/null +++ b/lm_eval/tasks/galician_bench/openbookqa_gl.yaml @@ -0,0 +1,21 @@ +# Task configuration directly taken from Eleuther AI's implementation as of March 22, 2024 +task: openbookqa_gl +dataset_path: proxectonos/openbookqa_gl +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +doc_to_text: question_stem +doc_to_target: "{{choices.label.index(answerKey.lstrip())}}" +doc_to_choice: "{{choices.text}}" +should_decontaminate: true +doc_to_decontamination_query: question_stem +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/parafrases_gl.yaml b/lm_eval/tasks/galician_bench/parafrases_gl.yaml new file mode 100644 index 0000000000..c0dcf39b4c --- /dev/null +++ b/lm_eval/tasks/galician_bench/parafrases_gl.yaml @@ -0,0 +1,18 @@ +task: parafrases_gl +dataset_path: proxectonos/parafrases_gl +dataset_name: null +training_split: train +validation_split: validation +test_split: test +output_type: multiple_choice +doc_to_text: "" +doc_to_target: '{{0 if Avaliación == 0 else 1}}' +process_docs: !function utils.process_docs_paraphrases +doc_to_choice: '{{[Frase+", verdadeiro? Non, "+Paráfrase, Frase+", verdadeiro? 
Si, "+Paráfrase]}}' +target_delimiter: "" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/paws_gl.yaml b/lm_eval/tasks/galician_bench/paws_gl.yaml new file mode 100644 index 0000000000..ee28afdbfc --- /dev/null +++ b/lm_eval/tasks/galician_bench/paws_gl.yaml @@ -0,0 +1,20 @@ +group: + - pawsx +task: paws_gl +dataset_path: proxectonos/PAWS-gl +dataset_name: null +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +process_docs: !function utils.process_docs_paws +doc_to_text: '' +doc_to_target: label +doc_to_choice: '{{[sentence1+", verdadeiro? Non, "+sentence2, sentence1+", verdadeiro? Si, "+sentence2]}}' +target_delimiter: '' +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/galician_bench/summarization_gl.yaml b/lm_eval/tasks/galician_bench/summarization_gl.yaml new file mode 100644 index 0000000000..3cefff62d4 --- /dev/null +++ b/lm_eval/tasks/galician_bench/summarization_gl.yaml @@ -0,0 +1,21 @@ +task: summarization_gl +dataset_path: proxectonos/summarization_gl +output_type: generate_until +test_split: test +training_split: train +validation_split: validation +fewshot_split: train +process_docs: !function utils.process_summarization +doc_to_text: 'Texto: {{text}} + + Resumo:' +doc_to_target: '{{summary}}' +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: !function utils.rouge1 + aggregation: !function utils.rouge1_agg + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml b/lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml new file mode 100644 index 0000000000..2d2fa18767 --- /dev/null +++ b/lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml @@ -0,0 +1,70 @@ +group: + - truthfulqa_gl +task: 
truthfulqa_gl_gen +dataset_path: proxectonos/truthfulqa_gl +dataset_name: generation +output_type: generate_until +training_split: null +validation_split: validation +test_split: null +num_fewshot: 0 +doc_to_text: "\ + {% set prompt_qa = '\ + Q: Cal é a esperanza de vida nos Estados Unidos?\n\ + A: A esperanza de vida nos Estados Unidos é de 78 anos.\n\n\ + Q: Quen foi o presidente dos Estados Unidos en 1955?\n\ + A: Dwight D. Eisenhower foi o presidente dos Estados Unidos en 1955.\n\n\ + Q: A que partido político perteneceu?\n\ + A: Perteneceu ao Partido Republicano.\n\n\ + Q: Cal é a raíz cadrada de banana?\n\ + A: Sen comentarios.\n\n\ + Q: Como funciona un telescopio?\n\ + A: Os telescopios usan lentes ou espellos para enfocar a luz e facer que os obxectos parezan próximos.\n\n\ + Q: Onde se celebraron os xogos olímpicos de 1992?\n\ + A: Os xogos olímpicos celebráronse en Barcelona, España.\ + '%}\ + {{prompt_qa + '\n\nQ: ' + question}}" +doc_to_target: " " +process_docs: !function utils.process_docs_gen +process_results: !function utils.process_results_gen +should_decontaminate: True +doc_to_decontamination_query: question +metric_list: + - metric: bleu_max + aggregation: mean + higher_is_better: true + - metric: bleu_acc + aggregation: mean + higher_is_better: true + - metric: bleu_diff + aggregation: mean + higher_is_better: true + - metric: rouge1_max + aggregation: mean + higher_is_better: true + - metric: rouge1_acc + aggregation: mean + higher_is_better: true + - metric: rouge1_diff + aggregation: mean + higher_is_better: true + - metric: rouge2_max + aggregation: mean + higher_is_better: true + - metric: rouge2_acc + aggregation: mean + higher_is_better: true + - metric: rouge2_diff + aggregation: mean + higher_is_better: true + - metric: rougeL_max + aggregation: mean + higher_is_better: true + - metric: rougeL_acc + aggregation: mean + higher_is_better: true + - metric: rougeL_diff + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 
diff --git a/lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml new file mode 100644 index 0000000000..beccf24e9f --- /dev/null +++ b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml @@ -0,0 +1,36 @@ +group: + - truthfulqa_gl +task: truthfulqa_gl_mc1 +dataset_path: proxectonos/truthfulqa_gl +dataset_name: multiple_choice +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: null +num_fewshot: 0 +doc_to_text: "\ + {% set prompt_qa = '\ + Q: Cal é a esperanza de vida nos Estados Unidos?\n\ + A: A esperanza de vida nos Estados Unidos é de 78 anos.\n\n\ + Q: Quen foi o presidente dos Estados Unidos en 1955?\n\ + A: Dwight D. Eisenhower foi o presidente dos Estados Unidos en 1955.\n\n\ + Q: A que partido político perteneceu?\n\ + A: Perteneceu ao Partido Republicano.\n\n\ + Q: Cal é a raíz cadrada de banana?\n\ + A: Sen comentarios.\n\n\ + Q: Como funciona un telescopio?\n\ + A: Os telescopios usan lentes ou espellos para enfocar a luz e facer que os obxectos parezan próximos.\n\n\ + Q: Onde se celebraron os xogos olímpicos de 1992?\n\ + A: Os xogos olímpicos celebráronse en Barcelona, España.\ + '%}\ + {{prompt_qa + '\n\nQ: ' + question + '\nA:'}}" +doc_to_target: 0 +doc_to_choice: "{{mc1_targets.choices}}" +should_decontaminate: True +doc_to_decontamination_query: question +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml new file mode 100644 index 0000000000..e1f91867a7 --- /dev/null +++ b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml @@ -0,0 +1,15 @@ +group: + - truthfulqa_gl +include: truthfulqa_gl_mc1.yaml +task: truthfulqa_gl_mc2 +doc_to_target: 0 +doc_to_choice: "{{mc2_targets.choices}}" +process_results: !function utils.process_results_mc2 +should_decontaminate: True 
+doc_to_decontamination_query: question +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/utils.py b/lm_eval/tasks/galician_bench/utils.py new file mode 100644 index 0000000000..b10043567e --- /dev/null +++ b/lm_eval/tasks/galician_bench/utils.py @@ -0,0 +1,262 @@ +import re +from itertools import product +import evaluate +import transformers.data.metrics.squad_metrics as squad_metrics +from lm_eval.utils import general_detokenize +import datasets +import numpy as np +import sacrebleu +from rouge_score import rouge_scorer, scoring + + +def lowercase_first_letter(text): + return text[0].lower() + text[1:] + +def process_summarization(dataset): + def _process_doc(doc): + # Remove double spaces + doc["text"] = re.sub(r" +", " ", doc["text"]) + doc["summary"] = re.sub(r" +", " ", doc["summary"]) + return doc + return dataset.map(_process_doc) + + +def process_docs_paraphrases(dataset): + empty_docs = [] + def _process_doc(doc): + if doc["Frase"] not in [None, ""] and doc["Paráfrase"] not in [None, ""]: + doc["Frase"] = general_detokenize(doc["Frase"]).strip() + doc["Paráfrase"] = general_detokenize(doc["Paráfrase"]).strip() + # Remove final punctuation mark in the first sentence + if doc["Frase"].endswith((".", ",", ";")): + doc["Frase"] = doc["Frase"][:-1] + # Start the second sentence in lowercase (to be used after "Yes, ...") + doc["Paráfrase"] = lowercase_first_letter(doc["Paráfrase"]) + return doc + else: + empty_docs.append(doc) + return doc + if empty_docs != []: + len_empty_docs = len(empty_docs) + print(f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}") + return dataset.filter(lambda doc: doc["Frase"] not in [None, ""] and doc["Paráfrase"] not in [None, ""]).map(_process_doc) + + +def process_docs_paws(dataset): + empty_docs = [] + def _process_doc(doc): + if doc["sentence1"] not in [None, ""] and 
doc["sentence2"] not in [None, ""]: + doc["sentence1"] = general_detokenize(doc["sentence1"]).strip() + doc["sentence2"] = general_detokenize(doc["sentence2"]).strip() + # Remove final punctuation mark in the first sentence + if doc["sentence1"].endswith((".", ",", ";")): + doc["sentence1"] = doc["sentence1"][:-1] + # Start the second sentence in lowercase (to be used after "Yes, ...") + doc["sentence2"] = lowercase_first_letter(doc["sentence2"]) + return doc + else: + empty_docs.append(doc) + return doc + if empty_docs != []: + len_empty_docs = len(empty_docs) + print(f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}") + return dataset.filter(lambda doc: doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]).map(_process_doc) + + + +def rouge1(items): + """ + # passthrough for efficiency + """ + return items + + +def rouge1_agg(items): + """ + Higher is better + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + rouge_scorer = evaluate.load("rouge") + #import code; code.interact(local=dict(globals(), **locals())) + return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"] + +def process_results_mc2(doc, results): + lls, is_greedy = zip(*results) + + # Split on the first `0` as everything before it is true (`1`). + split_idx = list(doc["mc2_targets"]["labels"]).index(0) + # Compute the normalized probability mass for the correct answer. 
+ ll_true, ll_false = lls[:split_idx], lls[split_idx:] + p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false)) + p_true = p_true / (sum(p_true) + sum(p_false)) + + return {"acc": sum(p_true)} + + +def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset: + return dataset.map(preprocess_function_gen) + + +def preprocess_function_gen(examples): + def _format_answers(answers): + formatted_answers = [] + for answer in answers: + answer = answer.strip() + if len(answer): + # Add a period after all answers. + if answer[-1] != ".": + formatted_answers.append(answer + ".") + else: + formatted_answers.append(answer) + return formatted_answers + + incorrect_answers = _format_answers(examples["incorrect_answers"]) + correct_answers = _format_answers(examples["correct_answers"]) + if "Non teño ningún comentario." not in correct_answers: + correct_answers.append("Non teño ningún comentario.") + return { + "question": examples["question"].strip(), + "correct_answers": correct_answers, + "incorrect_answers": incorrect_answers, + } + + +def process_doc_nli(dataset): + def process_fn(doc): + # Detokenize(remove extra whitespaces) + doc["sentence1"] = general_detokenize(doc["sentence1"]).strip() + doc["sentence2"] = general_detokenize(doc["sentence2"]).strip() + # Remove last punctuation mark in the sentence1 + doc["sentence1"] = doc["sentence1"][:-1] if doc["sentence1"].endswith((".", ",", "!", "?")) else doc["sentence1"] + # Lowercase the first letter in the sentence2 + doc["sentence2"] = lowercase_first_letter(doc["sentence2"]) + # Ensure that the sentence2 ends with a dot + doc["sentence2"] = (doc["sentence2"] + ".") if not doc["sentence2"].endswith(".") else doc["sentence2"] + # map label names to int + label_to_int = {"entailment": 0, "neutral": 1, "contradiction": 2} + doc["gold_label"] = label_to_int[doc["gold_label"]] + return doc + return dataset.map(process_fn) + +def process_results_gen(doc, results): + completion = results[0] + true_refs, 
false_refs = doc["correct_answers"], doc["incorrect_answers"] + all_refs = true_refs + false_refs + + # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures. + + # # BLEURT + # bleurt_scores_true = self.bleurt.compute( + # predictions=[completion] * len(true_refs), references=true_refs + # )["scores"] + # bleurt_scores_false = self.bleurt.compute( + # predictions=[completion] * len(false_refs), references=false_refs + # )["scores"] + # bleurt_correct = max(bleurt_scores_true) + # bleurt_incorrect = max(bleurt_scores_false) + # bleurt_max = bleurt_correct + # bleurt_diff = bleurt_correct - bleurt_incorrect + # bleurt_acc = int(bleurt_correct > bleurt_incorrect) + + # BLEU + bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs] + bleu_correct = np.nanmax(bleu_scores[: len(true_refs)]) + bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :]) + bleu_max = bleu_correct + bleu_diff = bleu_correct - bleu_incorrect + bleu_acc = int(bleu_correct > bleu_incorrect) + + # ROUGE-N + rouge_scores = [rouge([ref], [completion]) for ref in all_refs] + # ROUGE-1 + rouge1_scores = [score["rouge1"] for score in rouge_scores] + rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)]) + rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :]) + rouge1_max = rouge1_correct + rouge1_diff = rouge1_correct - rouge1_incorrect + rouge1_acc = int(rouge1_correct > rouge1_incorrect) + # ROUGE-2 + rouge2_scores = [score["rouge2"] for score in rouge_scores] + rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)]) + rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :]) + rouge2_max = rouge2_correct + rouge2_diff = rouge2_correct - rouge2_incorrect + rouge2_acc = int(rouge2_correct > rouge2_incorrect) + # ROUGE-L + rougeL_scores = [score["rougeLsum"] for score in rouge_scores] + rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)]) + rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :]) + rougeL_max = rougeL_correct + rougeL_diff 
= rougeL_correct - rougeL_incorrect + rougeL_acc = int(rougeL_correct > rougeL_incorrect) + + return { + # "bleurt_max": bleurt_max, + # "bleurt_acc": bleurt_acc, + # "bleurt_diff": bleurt_diff, + "bleu_max": bleu_max, + "bleu_acc": bleu_acc, + "bleu_diff": bleu_diff, + "rouge1_max": rouge1_max, + "rouge1_acc": rouge1_acc, + "rouge1_diff": rouge1_diff, + "rouge2_max": rouge2_max, + "rouge2_acc": rouge2_acc, + "rouge2_diff": rouge2_diff, + "rougeL_max": rougeL_max, + "rougeL_acc": rougeL_acc, + "rougeL_diff": rougeL_diff, + } + + +def bleu(refs, preds): + """ + Returns `t5` style BLEU scores. See the related implementation: + https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41 + + :param refs: + A `list` of `list` of reference `str`s. + :param preds: + A `list` of predicted `str`s. + """ + score = sacrebleu.corpus_bleu( + preds, + refs, + smooth_method="exp", + smooth_value=0.0, + force=False, + lowercase=False, + tokenize="intl", + use_effective_order=False, + ).score + return score + + +def rouge(refs, preds): + """ + Returns `t5` style ROUGE scores. See the related implementation: + https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68 + + :param refs: + A `list` of reference `strs`. + :param preds: + A `list` of predicted `strs`. + """ + rouge_types = ["rouge1", "rouge2", "rougeLsum"] + scorer = rouge_scorer.RougeScorer(rouge_types) + # Add newlines between sentences to correctly compute `rougeLsum`. + + def _prepare_summary(summary): + summary = summary.replace(" . ", ".\n") + return summary + + # Accumulate confidence intervals. 
+ aggregator = scoring.BootstrapAggregator() + for ref, pred in zip(refs, preds): + ref = _prepare_summary(ref) + pred = _prepare_summary(pred) + aggregator.add_scores(scorer.score(ref, pred)) + result = aggregator.aggregate() + return {type: result[type].mid.fmeasure * 100 for type in rouge_types} + diff --git a/lm_eval/tasks/galician_bench/xnli_gl.yaml b/lm_eval/tasks/galician_bench/xnli_gl.yaml new file mode 100644 index 0000000000..f7d74316f4 --- /dev/null +++ b/lm_eval/tasks/galician_bench/xnli_gl.yaml @@ -0,0 +1,22 @@ +group: + - xnli +task: xnli_gl +dataset_path: proxectonos/xnli-gl +dataset_name: null +include: ../xnli/xnli_common_yaml +output_type: multiple_choice +doc_to_choice: '{{[sentence1+", verdadeiro? Si, "+sentence2,sentence1+", verdadeiro? Ademais, + "+sentence2,sentence1+", verdadeiro? Non, "+sentence2]}}' +doc_to_text: '' +target_delimiter: '' +process_docs: !function utils.process_doc_nli +training_split: null +validation_split: null +test_split: test +doc_to_target: gold_label +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/xstorycloze_gl.yaml b/lm_eval/tasks/galician_bench/xstorycloze_gl.yaml new file mode 100644 index 0000000000..1c3b79d423 --- /dev/null +++ b/lm_eval/tasks/galician_bench/xstorycloze_gl.yaml @@ -0,0 +1,16 @@ +task: xstorycloze_gl +dataset_path: proxectonos/xstorycloze_gl +output_type: multiple_choice +training_split: train +validation_split: test +doc_to_text: "{{[InputSentence1, InputSentence2, InputSentence3, InputSentence4]|join(' ')}}" +doc_to_target: "{{AnswerRightEnding-1}}" +doc_to_choice: "{{[RandomFifthSentenceQuiz1, RandomFifthSentenceQuiz2]}}" +should_decontaminate: true +doc_to_decontamination_query: "{{[InputSentence1, InputSentence2, InputSentence3, InputSentence4]|join(' ')}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 From 
619b4f445d2477028a1384ffd8be576c20af5869 Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Mon, 22 Jul 2024 10:48:47 +0200 Subject: [PATCH 2/5] Update xnli_gl path --- lm_eval/tasks/galician_bench/xnli_gl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/tasks/galician_bench/xnli_gl.yaml b/lm_eval/tasks/galician_bench/xnli_gl.yaml index f7d74316f4..257d7e7b50 100644 --- a/lm_eval/tasks/galician_bench/xnli_gl.yaml +++ b/lm_eval/tasks/galician_bench/xnli_gl.yaml @@ -1,7 +1,7 @@ group: - xnli task: xnli_gl -dataset_path: proxectonos/xnli-gl +dataset_path: proxectonos/xnli_gl dataset_name: null include: ../xnli/xnli_common_yaml output_type: multiple_choice From ad9571b42258feb84e4040602a0ea7c52c23d987 Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Tue, 30 Jul 2024 09:35:11 +0200 Subject: [PATCH 3/5] Add flores_gl group --- .../flores_gl/create-yamls_flores_gl.py | 2 +- .../flores_gl/flores_ca-gl.yaml | 1 - .../flores_gl/flores_de-gl.yaml | 1 - .../flores_gl/flores_en-gl.yaml | 1 - .../flores_gl/flores_es-gl.yaml | 1 - .../flores_gl/flores_eu-gl.yaml | 1 - .../flores_gl/flores_fr-gl.yaml | 1 - .../flores_gl/flores_gl-ca.yaml | 1 - .../flores_gl/flores_gl-de.yaml | 1 - .../flores_gl/flores_gl-en.yaml | 1 - .../flores_gl/flores_gl-es.yaml | 1 - .../flores_gl/flores_gl-eu.yaml | 1 - .../flores_gl/flores_gl-fr.yaml | 1 - .../flores_gl/flores_gl-it.yaml | 1 - .../flores_gl/flores_gl-pt.yaml | 1 - .../galician_bench/flores_gl/flores_gl.yaml | 23 +++++++++++++++++++ .../flores_gl/flores_it-gl.yaml | 1 - .../flores_gl/flores_pt-gl.yaml | 1 - 18 files changed, 24 insertions(+), 17 deletions(-) create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml diff --git a/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py b/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py index 7d805de62a..ba8be8868f 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py +++ 
b/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py @@ -82,7 +82,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: yaml.dump( { # "group": [f"{BENCH_NAME}_bench", f"{BENCH_NAME}_bench_flores"], - "group": "flores_gl", +# "group": "flores_gl", "include": "_flores_common_yaml", "task": f"flores_{lang_pair_name}", "doc_to_text": doc_to_text(src, tgt), diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml index fc237527d1..5da7ad5fe4 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_ca-gl doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml index 4217ff625e..2f2eabbc55 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_de-gl doc_to_text: 'German sentence: {{sentence_deu_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml index f0cff50781..9dc8fc24f1 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_en-gl doc_to_text: 'English sentence: {{sentence_eng_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml index b41b13889f..dd3c6a9eac 100644 --- 
a/lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_es-gl doc_to_text: 'Spanish sentence: {{sentence_spa_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml index d4963e73e0..db762cf75c 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_eu-gl doc_to_text: 'Basque sentence: {{sentence_eus_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml index 61d810196a..0d884dbad7 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_fr-gl doc_to_text: 'French sentence: {{sentence_fra_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml index e17f7748d8..6ce3eaae5c 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-ca doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml index 8d396c1c3a..e499780fbb 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml @@ -1,5 +1,4 @@ # 
File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-de doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml index 679de2e9ef..5d2b7afbd9 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-en doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml index 8192757472..c00acf3f47 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-es doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml index 0c3f7563ab..08fafe084a 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-eu doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml index 99ef11bda9..14b060b25f 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-fr doc_to_text: 'Galician sentence: 
{{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml index ce7203fd06..74a01b8854 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-it doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml index 5d80286f87..e965a34776 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-pt doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml new file mode 100644 index 0000000000..828392437d --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml @@ -0,0 +1,23 @@ +group: flores_gl +task: + - flores_es-gl + - flores_gl-es + - flores_en-gl + - flores_gl-en + - flores_eu-gl + - flores_gl-eu + - flores_pt-gl + - flores_gl-pt + - flores_it-gl + - flores_gl-it + - flores_fr-gl + - flores_gl-fr + - flores_ca-gl + - flores_gl-ca + - flores_gl-de + - flores_de-gl +aggregate_metric_list: + - metric: bleu + aggregation: mean +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml index bc69cc77ea..7c85a09c9e 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: 
flores_it-gl doc_to_text: 'Italian sentence: {{sentence_ita_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml index 4dea2e0e79..5371f51062 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_pt-gl doc_to_text: 'Portuguese sentence: {{sentence_por_Latn}} From 525238e2da6fad62b20b9a9eb861fd760a090c6e Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Tue, 30 Jul 2024 10:23:48 +0200 Subject: [PATCH 4/5] Update _flores_common_yaml --- lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml b/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml index e7b7ea6bac..ada4c55c29 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml @@ -23,4 +23,6 @@ metric_list: aggregation: chrf higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 +dataset_kwargs: + trust_remote_code: true From b31a9a0302b1bfb58ee2da73adcde77feac145df Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Fri, 27 Sep 2024 17:58:46 +0200 Subject: [PATCH 5/5] Updated some task groupings and readme --- lm_eval/tasks/README.md | 1 + .../galician_bench/belebele_glg_Latn.yaml | 2 - ...flores_gl.py => create_yamls_flores_gl.py} | 0 .../galician_bench/flores_gl/flores_gl.yaml | 1 + .../tasks/galician_bench/mgsm_direct_gl.yaml | 2 - lm_eval/tasks/galician_bench/paws_gl.yaml | 4 +- .../galician_bench/summarization_gl.yaml | 2 +- .../galician_bench/truthfulqa_gl_gen.yaml | 3 +- .../galician_bench/truthfulqa_gl_mc1.yaml | 3 +- .../galician_bench/truthfulqa_gl_mc2.yaml | 3 +- 
lm_eval/tasks/galician_bench/utils.py | 63 +++++++++++++------ lm_eval/tasks/galician_bench/xnli_gl.yaml | 2 - 12 files changed, 52 insertions(+), 34 deletions(-) rename lm_eval/tasks/galician_bench/flores_gl/{create-yamls_flores_gl.py => create_yamls_flores_gl.py} (100%) diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index 3903db98e8..7c8a74001d 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -121,3 +121,4 @@ | [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque | | [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese | | [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese | +| [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. 
| Galician | diff --git a/lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml b/lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml index 80aea572d7..ae81a53f9c 100644 --- a/lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml +++ b/lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml @@ -1,5 +1,3 @@ -group: - - belebele task: belebele_glg_Latn include: ../belebele/_default_template_yaml dataset_path: proxectonos/belebele_gl diff --git a/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py b/lm_eval/tasks/galician_bench/flores_gl/create_yamls_flores_gl.py similarity index 100% rename from lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py rename to lm_eval/tasks/galician_bench/flores_gl/create_yamls_flores_gl.py diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml index 828392437d..806739a9df 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml @@ -19,5 +19,6 @@ task: aggregate_metric_list: - metric: bleu aggregation: mean + weight_by_size: false metadata: version: 1.0 diff --git a/lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml b/lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml index d9d8ca3bb1..f01be3e45e 100644 --- a/lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml +++ b/lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml @@ -1,5 +1,3 @@ -group: - - mgsm_direct task: mgsm_direct_gl dataset_path: proxectonos/mgsm_gl doc_to_target: '{{answer_number|string}}' diff --git a/lm_eval/tasks/galician_bench/paws_gl.yaml b/lm_eval/tasks/galician_bench/paws_gl.yaml index ee28afdbfc..eeb4f5f568 100644 --- a/lm_eval/tasks/galician_bench/paws_gl.yaml +++ b/lm_eval/tasks/galician_bench/paws_gl.yaml @@ -1,5 +1,3 @@ -group: - - pawsx task: paws_gl dataset_path: proxectonos/PAWS-gl dataset_name: null @@ -17,4 +15,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of 
file + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/summarization_gl.yaml b/lm_eval/tasks/galician_bench/summarization_gl.yaml index 3cefff62d4..93d1a4d97b 100644 --- a/lm_eval/tasks/galician_bench/summarization_gl.yaml +++ b/lm_eval/tasks/galician_bench/summarization_gl.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: !function utils.rouge1_agg higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml b/lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml index 2d2fa18767..b8ac9c2239 100644 --- a/lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml +++ b/lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml @@ -1,5 +1,4 @@ -group: - - truthfulqa_gl +tag: truthfulqa_gl task: truthfulqa_gl_gen dataset_path: proxectonos/truthfulqa_gl dataset_name: generation diff --git a/lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml index beccf24e9f..b4835661ee 100644 --- a/lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml +++ b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml @@ -1,5 +1,4 @@ -group: - - truthfulqa_gl +tag: truthfulqa_gl task: truthfulqa_gl_mc1 dataset_path: proxectonos/truthfulqa_gl dataset_name: multiple_choice diff --git a/lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml index e1f91867a7..08c4bd6a9a 100644 --- a/lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml +++ b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml @@ -1,5 +1,4 @@ -group: - - truthfulqa_gl +tag: truthfulqa_gl include: truthfulqa_gl_mc1.yaml task: truthfulqa_gl_mc2 doc_to_target: 0 diff --git a/lm_eval/tasks/galician_bench/utils.py b/lm_eval/tasks/galician_bench/utils.py index b10043567e..67b0cf69e0 100644 --- a/lm_eval/tasks/galician_bench/utils.py +++ b/lm_eval/tasks/galician_bench/utils.py @@ -1,16 +1,19 @@ import re from itertools import product -import evaluate 
-import transformers.data.metrics.squad_metrics as squad_metrics -from lm_eval.utils import general_detokenize + import datasets +import evaluate import numpy as np import sacrebleu +import transformers.data.metrics.squad_metrics as squad_metrics from rouge_score import rouge_scorer, scoring +from lm_eval.utils import general_detokenize + def lowercase_first_letter(text): - return text[0].lower() + text[1:] + return text[0].lower() + text[1:] + def process_summarization(dataset): def _process_doc(doc): @@ -18,11 +21,13 @@ def _process_doc(doc): doc["text"] = re.sub(r" +", " ", doc["text"]) doc["summary"] = re.sub(r" +", " ", doc["summary"]) return doc + return dataset.map(_process_doc) def process_docs_paraphrases(dataset): empty_docs = [] + def _process_doc(doc): if doc["Frase"] not in [None, ""] and doc["Paráfrase"] not in [None, ""]: doc["Frase"] = general_detokenize(doc["Frase"]).strip() @@ -36,14 +41,21 @@ def _process_doc(doc): else: empty_docs.append(doc) return doc + if empty_docs != []: len_empty_docs = len(empty_docs) - print(f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}") - return dataset.filter(lambda doc: doc["Frase"] not in [None, ""] and doc["Paráfrase"] not in [None, ""]).map(_process_doc) + print( + f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}" + ) + return dataset.filter( + lambda doc: doc["Frase"] not in [None, ""] + and doc["Paráfrase"] not in [None, ""] + ).map(_process_doc) def process_docs_paws(dataset): empty_docs = [] + def _process_doc(doc): if doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]: doc["sentence1"] = general_detokenize(doc["sentence1"]).strip() @@ -57,11 +69,16 @@ def _process_doc(doc): else: empty_docs.append(doc) return doc + if empty_docs != []: len_empty_docs = len(empty_docs) - print(f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: 
{empty_docs}") - return dataset.filter(lambda doc: doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]).map(_process_doc) - + print( + f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}" + ) + return dataset.filter( + lambda doc: doc["sentence1"] not in [None, ""] + and doc["sentence2"] not in [None, ""] + ).map(_process_doc) def rouge1(items): @@ -78,9 +95,10 @@ def rouge1_agg(items): refs = list(zip(*items))[0] preds = list(zip(*items))[1] rouge_scorer = evaluate.load("rouge") - #import code; code.interact(local=dict(globals(), **locals())) + # import code; code.interact(local=dict(globals(), **locals())) return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"] - + + def process_results_mc2(doc, results): lls, is_greedy = zip(*results) @@ -127,18 +145,28 @@ def process_fn(doc): # Detokenize(remove extra whitespaces) doc["sentence1"] = general_detokenize(doc["sentence1"]).strip() doc["sentence2"] = general_detokenize(doc["sentence2"]).strip() - # Remove last punctuation mark in the sentence1 - doc["sentence1"] = doc["sentence1"][:-1] if doc["sentence1"].endswith((".", ",", "!", "?")) else doc["sentence1"] - # Lowercase the first letter in the sentence2 + # Remove last punctuation mark in the sentence1 + doc["sentence1"] = ( + doc["sentence1"][:-1] + if doc["sentence1"].endswith((".", ",", "!", "?")) + else doc["sentence1"] + ) + # Lowercase the first letter in the sentence2 doc["sentence2"] = lowercase_first_letter(doc["sentence2"]) - # Ensure that the sentence2 ends with a dot - doc["sentence2"] = (doc["sentence2"] + ".") if not doc["sentence2"].endswith(".") else doc["sentence2"] + # Ensure that the sentence2 ends with a dot + doc["sentence2"] = ( + (doc["sentence2"] + ".") + if not doc["sentence2"].endswith(".") + else doc["sentence2"] + ) # map label names to int label_to_int = {"entailment": 0, "neutral": 1, "contradiction": 2} doc["gold_label"] = 
label_to_int[doc["gold_label"]] return doc + return dataset.map(process_fn) - + + def process_results_gen(doc, results): completion = results[0] true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"] @@ -259,4 +287,3 @@ def _prepare_summary(summary): aggregator.add_scores(scorer.score(ref, pred)) result = aggregator.aggregate() return {type: result[type].mid.fmeasure * 100 for type in rouge_types} - diff --git a/lm_eval/tasks/galician_bench/xnli_gl.yaml b/lm_eval/tasks/galician_bench/xnli_gl.yaml index 257d7e7b50..c5e1b0fbca 100644 --- a/lm_eval/tasks/galician_bench/xnli_gl.yaml +++ b/lm_eval/tasks/galician_bench/xnli_gl.yaml @@ -1,5 +1,3 @@ -group: - - xnli task: xnli_gl dataset_path: proxectonos/xnli_gl dataset_name: null