From 869998bca77bc3bb63351b739111904dc1d70c20 Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Fri, 19 Jul 2024 14:17:01 +0200 Subject: [PATCH 1/4] Add catalan_bench --- lm_eval/tasks/catalan_bench/README.md | 123 ++++++++++++++++++ .../tasks/catalan_bench/_arc_ca_common_yaml | 21 +++ .../tasks/catalan_bench/_cabreu_common_yaml | 17 +++ .../tasks/catalan_bench/arc_ca_challenge.yaml | 3 + lm_eval/tasks/catalan_bench/arc_ca_easy.yaml | 3 + .../catalan_bench/cabreu_abstractive.yaml | 8 ++ .../catalan_bench/cabreu_extractive.yaml | 8 ++ .../tasks/catalan_bench/cabreu_extreme.yaml | 8 ++ .../tasks/catalan_bench/catalan_bench.yaml | 25 ++++ lm_eval/tasks/catalan_bench/catalanqa.yaml | 25 ++++ lm_eval/tasks/catalan_bench/catcola.yaml | 14 ++ lm_eval/tasks/catalan_bench/copa_ca.yaml | 17 +++ lm_eval/tasks/catalan_bench/coqcat.yaml | 23 ++++ .../flores_ca/_flores_common_yaml | 23 ++++ .../flores_ca/create-yamls_flores_ca.py | 115 ++++++++++++++++ .../catalan_bench/flores_ca/flores_ca-de.yaml | 8 ++ .../catalan_bench/flores_ca/flores_ca-en.yaml | 8 ++ .../catalan_bench/flores_ca/flores_ca-es.yaml | 8 ++ .../catalan_bench/flores_ca/flores_ca-eu.yaml | 8 ++ .../catalan_bench/flores_ca/flores_ca-fr.yaml | 8 ++ .../catalan_bench/flores_ca/flores_ca-gl.yaml | 8 ++ .../catalan_bench/flores_ca/flores_ca-it.yaml | 8 ++ .../catalan_bench/flores_ca/flores_ca-pt.yaml | 8 ++ .../catalan_bench/flores_ca/flores_de-ca.yaml | 8 ++ .../catalan_bench/flores_ca/flores_en-ca.yaml | 8 ++ .../catalan_bench/flores_ca/flores_es-ca.yaml | 8 ++ .../catalan_bench/flores_ca/flores_eu-ca.yaml | 8 ++ .../catalan_bench/flores_ca/flores_fr-ca.yaml | 8 ++ .../catalan_bench/flores_ca/flores_gl-ca.yaml | 8 ++ .../catalan_bench/flores_ca/flores_it-ca.yaml | 8 ++ .../catalan_bench/flores_ca/flores_pt-ca.yaml | 8 ++ .../tasks/catalan_bench/mgsm_direct_ca.yaml | 27 ++++ .../tasks/catalan_bench/openbookqa_ca.yaml | 20 +++ lm_eval/tasks/catalan_bench/parafraseja.yaml | 17 +++ lm_eval/tasks/catalan_bench/paws_ca.yaml | 
20 +++ .../phrases_va/_phrases_va_common.yaml | 24 ++++ .../phrases_va/phrases_ca-va.yaml | 7 + .../phrases_va/phrases_va-ca.yaml | 7 + lm_eval/tasks/catalan_bench/piqa_ca.yaml | 21 +++ lm_eval/tasks/catalan_bench/siqa_ca.yaml | 16 +++ lm_eval/tasks/catalan_bench/teca.yaml | 18 +++ lm_eval/tasks/catalan_bench/utils.py | 122 +++++++++++++++++ lm_eval/tasks/catalan_bench/wnli_ca.yaml | 14 ++ lm_eval/tasks/catalan_bench/xnli_ca.yaml | 21 +++ lm_eval/tasks/catalan_bench/xquad_ca.yaml | 25 ++++ .../tasks/catalan_bench/xstorycloze_ca.yaml | 17 +++ 46 files changed, 937 insertions(+) create mode 100644 lm_eval/tasks/catalan_bench/README.md create mode 100644 lm_eval/tasks/catalan_bench/_arc_ca_common_yaml create mode 100644 lm_eval/tasks/catalan_bench/_cabreu_common_yaml create mode 100644 lm_eval/tasks/catalan_bench/arc_ca_challenge.yaml create mode 100644 lm_eval/tasks/catalan_bench/arc_ca_easy.yaml create mode 100644 lm_eval/tasks/catalan_bench/cabreu_abstractive.yaml create mode 100644 lm_eval/tasks/catalan_bench/cabreu_extractive.yaml create mode 100644 lm_eval/tasks/catalan_bench/cabreu_extreme.yaml create mode 100644 lm_eval/tasks/catalan_bench/catalan_bench.yaml create mode 100644 lm_eval/tasks/catalan_bench/catalanqa.yaml create mode 100644 lm_eval/tasks/catalan_bench/catcola.yaml create mode 100644 lm_eval/tasks/catalan_bench/copa_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/coqcat.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml create mode 100644 
lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/openbookqa_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/parafraseja.yaml create mode 100644 lm_eval/tasks/catalan_bench/paws_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml create mode 100644 lm_eval/tasks/catalan_bench/phrases_va/phrases_ca-va.yaml create mode 100644 lm_eval/tasks/catalan_bench/phrases_va/phrases_va-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/piqa_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/siqa_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/teca.yaml create mode 100644 lm_eval/tasks/catalan_bench/utils.py create mode 100644 lm_eval/tasks/catalan_bench/wnli_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/xnli_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/xquad_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml diff --git a/lm_eval/tasks/catalan_bench/README.md b/lm_eval/tasks/catalan_bench/README.md new file mode 100644 index 0000000000..32b1b0fc34 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/README.md @@ -0,0 +1,123 @@ +# 
CatalanBench + +### Paper + +CatalanBench is a benchmark for evaluating language models in Catalan tasks. That is, it evaluates the ability of a language model to understand and generate Catalan text. CatalanBench offers a combination of pre-existing, open datasets and datasets developed exclusively for this benchmark. All the details of CatalanBench will be published in a paper soon. + +The new evaluation datasets included in CatalanBench are: +| Task | Category | Homepage | +|:-------------:|:-----:|:-----:| +| ARC_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/arc_ca | +| MGSM_ca | Math | https://huggingface.co/datasets/projecte-aina/mgsm_ca | +| OpenBookQA_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/openbookqa_ca | +| Parafraseja | Paraphrasing | https://huggingface.co/datasets/projecte-aina/Parafraseja | +| PIQA_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/piqa_ca | +| SIQA_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/siqa_ca | +| XStoryCloze_ca | Commonsense Reasoning | https://huggingface.co/datasets/projecte-aina/xstorycloze_ca | + +The datasets included in CatalanBench that have been made public in previous publications are: + +| Task | Category | Paper title | Homepage | +|:-------------:|:-----:|:-------------:|:-----:| +| Belebele_ca | Reading Comprehension | [The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants](https://arxiv.org/abs/2308.16884) | https://huggingface.co/datasets/facebook/belebele | +| caBREU | Summarization | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/caBreu | +| CatalanQA | Question Answering | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | 
https://huggingface.co/datasets/projecte-aina/catalanqa | +| CatCoLA | Linguistic Acceptability | CatCoLA: Catalan Corpus of Linguistic Acceptability | https://huggingface.co/datasets/nbel/CatCoLA | +| COPA-ca | Commonsense Reasoning | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/COPA-ca | +| CoQCat | Question Answering | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/CoQCat | +| FLORES_ca | Translation | [The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation](https://arxiv.org/abs/2106.03193) | https://huggingface.co/datasets/facebook/flores | +| PAWS-ca | Paraphrasing | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/PAWS-ca | +| TE-ca | Natural Language Inference | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/teca | +| VeritasQA_ca | Truthfulness | VeritasQA: A Truthfulness Benchmark Aimed at Multilingual Transferability | TBA | +| WNLI-ca | Natural Language Inference | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/wnli-ca | +| XNLI-ca | Natural Language Inference | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/xnli-ca | +| XQuAD-ca | Question Answering | [Building a Data Infrastructure for a Mid-Resource Language: The Case of 
Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/xquad-ca | + + +### Citation +Paper for CatalanBench coming soon. + + + +### Groups and Tasks + +#### Groups + +- `catalan_bench`: All tasks included in CatalanBench. +- `cabreu`: Three CaBREU tasks for each type of summary (extractive, abstractive and extreme). +- `flores_ca`: All FLORES translation tasks from or to Catalan. +- `phrases_ca`: Two Phrases_va tasks for language adaptation between Catalan and Valencian. + + +#### Tasks + +The following tasks evaluate tasks on CatalanBench dataset using various scoring methods. + - `arc_ca_challenge` + - `arc_ca_easy` + - `belebele_cat_Latn` + - `cabreu` + - `catalanqa` + - `catcola` + - `copa_ca` + - `coqcat` + - `flores_ca` + - `flores_ca-de` + - `flores_ca-en` + - `flores_ca-es` + - `flores_ca-eu` + - `flores_ca-fr` + - `flores_ca-gl` + - `flores_ca-it` + - `flores_ca-pt` + - `flores_de-ca` + - `flores_en-ca` + - `flores_es-ca` + - `flores_eu-ca` + - `flores_fr-ca` + - `flores_gl-ca` + - `flores_it-ca` + - `flores_pt-ca` + - `mgsm_direct_ca` + - `openbookqa_ca` + - `parafraseja` + - `paws_ca` + - `phrases_ca` + - `piqa_ca` + - `siqa_ca` + - `teca` + - `veritasqa_gen_ca` + - `veritasqa_mc1_ca` + - `veritasqa_mc2_ca` + - `wnli_ca` + - `xnli_ca` + - `xquad_ca` + - `xstorycloze_ca` + +Some of these tasks are taken from benchmarks already available in LM Evaluation Harness. These are: +- `belebele_cat_Latn`: Belebele Catalan +- `veritasqa_gen_ca`: VeritasQA Catalan +- `veritasqa_mc1_ca`: VeritasQA Catalan +- `veritasqa_mc2_ca`: VeritasQA Catalan + + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? 
+ * [ ] Yes, original implementation contributed by author of the benchmark + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/lm_eval/tasks/catalan_bench/_arc_ca_common_yaml b/lm_eval/tasks/catalan_bench/_arc_ca_common_yaml new file mode 100644 index 0000000000..38dfc08b46 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/_arc_ca_common_yaml @@ -0,0 +1,21 @@ +group: + - ai2_arc +dataset_path: projecte-aina/arc_ca +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +doc_to_text: "Pregunta: {{question}}\nResposta:" +doc_to_target: "{{choices.label.index(answerKey)}}" +doc_to_choice: "{{choices.text}}" +should_decontaminate: true +doc_to_decontamination_query: "Pregunta: {{question}}\nResposta:" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/_cabreu_common_yaml b/lm_eval/tasks/catalan_bench/_cabreu_common_yaml new file mode 100644 index 0000000000..85c51a18b6 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/_cabreu_common_yaml @@ -0,0 +1,17 @@ +group: cabreu +dataset_path: projecte-aina/caBreu +dataset_name: null +output_type: generate_until +test_split: test +training_split: train +validation_split: validation +process_docs: !function utils.process_doc_cabreu +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: !function utils.rouge1 + aggregation: !function utils.rouge1_agg + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/arc_ca_challenge.yaml b/lm_eval/tasks/catalan_bench/arc_ca_challenge.yaml new file mode 100644 index 
0000000000..9d7a9c8423 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/arc_ca_challenge.yaml @@ -0,0 +1,3 @@ +task: arc_ca_challenge +dataset_name: ARC-Challenge +include: _arc_ca_common_yaml diff --git a/lm_eval/tasks/catalan_bench/arc_ca_easy.yaml b/lm_eval/tasks/catalan_bench/arc_ca_easy.yaml new file mode 100644 index 0000000000..67b28fd626 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/arc_ca_easy.yaml @@ -0,0 +1,3 @@ +task: arc_ca_easy +dataset_name: ARC-Easy +include: _arc_ca_common_yaml diff --git a/lm_eval/tasks/catalan_bench/cabreu_abstractive.yaml b/lm_eval/tasks/catalan_bench/cabreu_abstractive.yaml new file mode 100644 index 0000000000..930ba28a52 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/cabreu_abstractive.yaml @@ -0,0 +1,8 @@ +include: _cabreu_common_yaml +task: cabreu_abstractive +description: "Examina el text següent i genera'n un resum abstractiu, expressant el significat del text original d'una manera més natural i concisa.\n" +doc_to_text: >- + Text: {{content}} + + Resum: +doc_to_target: '{{summaries["abstractive"]["a1"]}}' diff --git a/lm_eval/tasks/catalan_bench/cabreu_extractive.yaml b/lm_eval/tasks/catalan_bench/cabreu_extractive.yaml new file mode 100644 index 0000000000..e5f3dd4dd0 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/cabreu_extractive.yaml @@ -0,0 +1,8 @@ +include: _cabreu_common_yaml +task: cabreu_extractive +description: "Examina el text següent i genera'n un resum extractiu, utilitzant les frases o oracions més rellevants del text original.\n" +doc_to_text: >- + Text: {{content}} + + Resum: +doc_to_target: '{{summaries["extractive"]["a1"]}}' diff --git a/lm_eval/tasks/catalan_bench/cabreu_extreme.yaml b/lm_eval/tasks/catalan_bench/cabreu_extreme.yaml new file mode 100644 index 0000000000..98efbe9cd4 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/cabreu_extreme.yaml @@ -0,0 +1,8 @@ +include: _cabreu_common_yaml +task: cabreu_extreme +description: "Examina el text següent i genera'n un resum que sigui el més concís 
possible i que preservi el significat del text original.\n" +doc_to_text: >- + Text: {{content}} + + Resum: +doc_to_target: '{{summaries["extreme"]["a1"]}}' diff --git a/lm_eval/tasks/catalan_bench/catalan_bench.yaml b/lm_eval/tasks/catalan_bench/catalan_bench.yaml new file mode 100644 index 0000000000..1f1f09ece2 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/catalan_bench.yaml @@ -0,0 +1,25 @@ +group: catalan_bench +task: + - belebele_cat_Latn + - xnli_ca + - catcola + - copa_ca + - openbookqa_ca + - parafraseja + - paws_ca + - piqa_ca + - siqa_ca + - teca + - wnli_ca + - arc_ca_easy + - arc_ca_challenge + - xstorycloze_ca + - xquad_ca + - catalanqa + - coqcat + - flores_ca + - cabreu + - mgsm_direct_ca + - phrases_va +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/catalanqa.yaml b/lm_eval/tasks/catalan_bench/catalanqa.yaml new file mode 100644 index 0000000000..8861794e94 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/catalanqa.yaml @@ -0,0 +1,25 @@ +task: catalanqa +dataset_path: projecte-aina/catalanqa +dataset_name: null +output_type: generate_until +training_split: train +validation_split: validation +test_split: test +doc_to_text: "Context: {{context}}\n\nPregunta: {{question}}\n\nResposta:" +doc_to_target: '{{answers[0]["text"]}}' +target_delimiter: ' ' +process_results: !function utils.process_results_qa +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + - metric: f1 + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/catcola.yaml b/lm_eval/tasks/catalan_bench/catcola.yaml new file mode 100644 index 0000000000..121b5e7f48 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/catcola.yaml @@ -0,0 +1,14 @@ +task: catcola +dataset_path: nbel/CatCoLA +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: null 
+doc_to_text: "{{Sentence}}\nPregunta: Té sentit aquesta frase?\nResposta:" +doc_to_target: label +doc_to_choice: ["no", "sí"] +metric_list: + - metric: mcc + - metric: acc +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/copa_ca.yaml b/lm_eval/tasks/catalan_bench/copa_ca.yaml new file mode 100644 index 0000000000..c7ca3f11a1 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/copa_ca.yaml @@ -0,0 +1,17 @@ +task: copa_ca +dataset_path: projecte-aina/COPA-ca +dataset_name: null +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +process_docs: !function utils.process_docs_copa_ca +doc_to_text: '{{premise[:-1].strip() + " " + {"cause": "perquè", "effect": "i per tant"}[question]}}' +doc_to_target: '{{choice1 if label == 0 else choice2}}' +doc_to_choice: '{{[choice1, choice2]}}' +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/coqcat.yaml b/lm_eval/tasks/catalan_bench/coqcat.yaml new file mode 100644 index 0000000000..12ab7d7cdf --- /dev/null +++ b/lm_eval/tasks/catalan_bench/coqcat.yaml @@ -0,0 +1,23 @@ +task: coqcat +dataset_path: projecte-aina/CoQCat +output_type: generate_until +training_split: train +validation_split: validation +test_split: test +doc_to_text: '{{story+"\n\n"}}{% for i in range(questions|length-1) %}{{"Q: "+questions[i]+"\n\n"+"A: "+answers["input_text"][i]+"\n\n"}}{% endfor %}{{"Q: "+questions[-1]+"\n\n"+"A:"}}' +doc_to_target: '{{ answers["input_text"][questions|length - 1] }}' +process_results: !function utils.process_results_coqcat +should_decontaminate: true +doc_to_decontamination_query: "{{story}} {{question.input_text|join('\n')}}" +generation_kwargs: + until: + - "\nQ:" +metric_list: + - metric: "em" + aggregation: mean + higher_is_better: true + - metric: "f1" + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of 
file diff --git a/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml b/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml new file mode 100644 index 0000000000..075a66b5f4 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml @@ -0,0 +1,23 @@ +dataset_path: facebook/flores +dataset_name: all +output_type: generate_until +training_split: dev +validation_split: dev +test_split: devtest +fewshot_split: dev +target_delimiter: '' +generation_kwargs: + until: + - "\n" +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: ter + aggregation: ter + higher_is_better: false + - metric: chrf + aggregation: chrf + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py b/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py new file mode 100644 index 0000000000..eeadc3ed76 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py @@ -0,0 +1,115 @@ +""" +Script to generate task YAMLs for the FLORES-200 dataset. +Based on `tasks/translation/utils.py`. 
+""" + +import argparse +import yaml +from langcodes import * +from itertools import * + +# utils +flatten = lambda l: list(itertools.chain(*l)) + +# constants +_LANGUAGES = [ +"ace_Arab", "bam_Latn", "dzo_Tibt", "hin_Deva", "khm_Khmr", "mag_Deva", "pap_Latn", "sot_Latn", "tur_Latn", +"ace_Latn", "ban_Latn", "ell_Grek", "hne_Deva", "kik_Latn", "mai_Deva", "pbt_Arab", "spa_Latn", "twi_Latn", +"acm_Arab", "bel_Cyrl", "eng_Latn", "hrv_Latn", "kin_Latn", "mal_Mlym", "pes_Arab", "srd_Latn", "tzm_Tfng", +"acq_Arab", "bem_Latn", "epo_Latn", "hun_Latn", "kir_Cyrl", "mar_Deva", "plt_Latn", "srp_Cyrl", "uig_Arab", +"aeb_Arab", "ben_Beng", "est_Latn", "hye_Armn", "kmb_Latn", "min_Arab", "pol_Latn", "ssw_Latn", "ukr_Cyrl", +"afr_Latn", "bho_Deva", "eus_Latn", "ibo_Latn", "kmr_Latn", "min_Latn", "por_Latn", "sun_Latn", "umb_Latn", +"ajp_Arab", "bjn_Arab", "ewe_Latn", "ilo_Latn", "knc_Arab", "mkd_Cyrl", "prs_Arab", "swe_Latn", "urd_Arab", +"aka_Latn", "bjn_Latn", "fao_Latn", "ind_Latn", "knc_Latn", "mlt_Latn", "quy_Latn", "swh_Latn", "uzn_Latn", +"als_Latn", "bod_Tibt", "fij_Latn", "isl_Latn", "kon_Latn", "mni_Beng", "ron_Latn", "szl_Latn", "vec_Latn", +"amh_Ethi", "bos_Latn", "fin_Latn", "ita_Latn", "kor_Hang", "mos_Latn", "run_Latn", "tam_Taml", "vie_Latn", +"apc_Arab", "bug_Latn", "fon_Latn", "jav_Latn", "lao_Laoo", "mri_Latn", "rus_Cyrl", "taq_Latn", "war_Latn", +"arb_Arab", "bul_Cyrl", "fra_Latn", "jpn_Jpan", "lij_Latn", "mya_Mymr", "sag_Latn", "taq_Tfng", "wol_Latn", +"arb_Latn", "cat_Latn", "fur_Latn", "kab_Latn", "lim_Latn", "nld_Latn", "san_Deva", "tat_Cyrl", "xho_Latn", +"ars_Arab", "ceb_Latn", "fuv_Latn", "kac_Latn", "lin_Latn", "nno_Latn", "sat_Olck", "tel_Telu", "ydd_Hebr", +"ary_Arab", "ces_Latn", "gaz_Latn", "kam_Latn", "lit_Latn", "nob_Latn", "scn_Latn", "tgk_Cyrl", "yor_Latn", +"arz_Arab", "cjk_Latn", "gla_Latn", "kan_Knda", "lmo_Latn", "npi_Deva", "shn_Mymr", "tgl_Latn", "yue_Hant", +"asm_Beng", "ckb_Arab", "gle_Latn", "kas_Arab", "ltg_Latn", "nso_Latn", 
"sin_Sinh", "tha_Thai", "zho_Hans", +"ast_Latn", "crh_Latn", "glg_Latn", "kas_Deva", "ltz_Latn", "nus_Latn", "slk_Latn", "tir_Ethi", "zho_Hant", +"awa_Deva", "cym_Latn", "grn_Latn", "kat_Geor", "lua_Latn", "nya_Latn", "slv_Latn", "tpi_Latn", "zsm_Latn", +"ayr_Latn", "dan_Latn", "guj_Gujr", "kaz_Cyrl", "lug_Latn", "oci_Latn", "smo_Latn", "tsn_Latn", "zul_Latn", +"azb_Arab", "deu_Latn", "hat_Latn", "kbp_Latn", "luo_Latn", "ory_Orya", "sna_Latn", "tso_Latn", +"azj_Latn", "dik_Latn", "hau_Latn", "kea_Latn", "lus_Latn", "pag_Latn", "snd_Arab", "tuk_Latn", +"bak_Cyrl", "dyu_Latn", "heb_Hebr", "khk_Cyrl", "lvs_Latn", "pan_Guru", "som_Latn", "tum_Latn" +] +LANGUAGE_PAIRS = [(a, b) for idx, a in enumerate(_LANGUAGES) for b in _LANGUAGES[idx + 1:]] + +LANGUAGES_OF_INTEREST = ["cat_Latn", "spa_Latn", "eng_Latn", "glg_Latn", "eus_Latn", "ita_Latn", "deu_Latn", "por_Latn", "fra_Latn"] +MAIN_LANG = "cat_Latn" +LANGUAGE_PAIRS = [(a, b) for (a, b) in LANGUAGE_PAIRS if a in LANGUAGES_OF_INTEREST and b in LANGUAGES_OF_INTEREST and "cat_Latn" in (a, b)] + +# auxiliary functions + +code_to_language_name = lambda code: Language.make(language=Language.get(code)["language"]).display_name() +code_to_short_name = lambda code: Language.get(code)["language"] +jinja_var = lambda s: "{{" + s + "}}" # wrapper to avoid having to escape { } in format strings + +def doc_to_text(src: str, tgt: str) -> str: + src_name, tgt_name = map(code_to_language_name, [src, tgt]) + + return f"""\ +{src_name} sentence: {jinja_var('sentence_' + src)} +{tgt_name} sentence:""" + +def doc_to_target(tgt: str) -> str: + + return f"{jinja_var('sentence_' + tgt)}" + +# main function + +def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: + """ + Generate a YAML file for each translation direction. 
+ """ + + err = [] + for src, tgt in LANGUAGE_PAIRS: + + # do both translation directions for each lang pair + for src, tgt in [(src, tgt), (tgt, src)]: + lang_pair_name = f"{code_to_short_name(src)}-{code_to_short_name(tgt)}" + yaml_file_name = f"flores_{lang_pair_name}.yaml" + + try: + with open( f"{output_dir}/{yaml_file_name}", "w" if overwrite else "x", encoding="utf-8") as outfile: + print(f"Creating {yaml_file_name}...") + outfile.write("# File generated by `create-yamls.py`\n") + yaml.dump( + { +# "group": [f"{BENCH_NAME}_bench", f"{BENCH_NAME}_bench_flores"], + "group": "flores_ca", + "include": "_flores_common_yaml", + "task": f"flores_{lang_pair_name}", + "doc_to_text": doc_to_text(src, tgt), + "doc_to_target": doc_to_target(tgt), + }, + outfile, + sort_keys=False, + ) + + except FileExistsError: + err.append(yaml_file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist:" + f" {', '.join(err)}" + "\nUse flag --overwrite to overwrite them." 
+ ) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--overwrite", default=False, action="store_true", help="Overwrite files if they already exist") + parser.add_argument( "--output-dir", default=".", help="Directory to write yaml files to" ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite) + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml new file mode 100644 index 0000000000..0bb415a04b --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-de +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + German sentence:' +doc_to_target: '{{sentence_deu_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml new file mode 100644 index 0000000000..3aa1351307 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-en +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + English sentence:' +doc_to_target: '{{sentence_eng_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml new file mode 100644 index 0000000000..1b0cf7c7f0 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-es +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + Spanish sentence:' +doc_to_target: '{{sentence_spa_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml 
b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml new file mode 100644 index 0000000000..fd540c6425 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-eu +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + Basque sentence:' +doc_to_target: '{{sentence_eus_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml new file mode 100644 index 0000000000..5aa495781d --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-fr +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + French sentence:' +doc_to_target: '{{sentence_fra_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml new file mode 100644 index 0000000000..d33cdd505e --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-gl +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml new file mode 100644 index 0000000000..61431e8da2 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-it +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + Italian sentence:' +doc_to_target: '{{sentence_ita_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml 
b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml new file mode 100644 index 0000000000..3b0e70b550 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-pt +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + Portuguese sentence:' +doc_to_target: '{{sentence_por_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml new file mode 100644 index 0000000000..363bd62ced --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_de-ca +doc_to_text: 'German sentence: {{sentence_deu_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml new file mode 100644 index 0000000000..81706f6dc2 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_en-ca +doc_to_text: 'English sentence: {{sentence_eng_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml new file mode 100644 index 0000000000..a74437e392 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_es-ca +doc_to_text: 'Spanish sentence: {{sentence_spa_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml 
b/lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml new file mode 100644 index 0000000000..dcf6e3760f --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_eu-ca +doc_to_text: 'Basque sentence: {{sentence_eus_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml new file mode 100644 index 0000000000..a5abc7ead8 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_fr-ca +doc_to_text: 'French sentence: {{sentence_fra_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml new file mode 100644 index 0000000000..78c554086f --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_gl-ca +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml new file mode 100644 index 0000000000..128834d976 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_it-ca +doc_to_text: 'Italian sentence: {{sentence_ita_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml 
b/lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml new file mode 100644 index 0000000000..dd355b797c --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_pt-ca +doc_to_text: 'Portuguese sentence: {{sentence_por_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml b/lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml new file mode 100644 index 0000000000..754d5b91ce --- /dev/null +++ b/lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml @@ -0,0 +1,27 @@ +group: + - mgsm_direct +task: mgsm_direct_ca +dataset_path: projecte-aina/mgsm_ca +doc_to_target: '{{answer_number|string}}' +doc_to_text: '{% if answer != None %}{{question + "\nResposta: "}}{% else %}{{"Pregunta: " + question + "\nResposta: "}}{% endif %}' +output_type: generate_until +training_split: train +test_split: test +target_delimiter: "" +generation_kwargs: + until: + - "\n\n" + - "\n" +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/openbookqa_ca.yaml b/lm_eval/tasks/catalan_bench/openbookqa_ca.yaml new file mode 100644 index 0000000000..00a1f03d4f --- /dev/null +++ b/lm_eval/tasks/catalan_bench/openbookqa_ca.yaml @@ -0,0 +1,20 @@ +task: openbookqa_ca +dataset_path: projecte-aina/openbookqa_ca +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +doc_to_text: question_stem +doc_to_target: "{{choices.label.index(answerKey.lstrip())}}" +doc_to_choice: "{{choices.text}}" +should_decontaminate: true +doc_to_decontamination_query: question_stem +metric_list: + - metric: acc + aggregation: mean + 
higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/parafraseja.yaml b/lm_eval/tasks/catalan_bench/parafraseja.yaml new file mode 100644 index 0000000000..208e3e373f --- /dev/null +++ b/lm_eval/tasks/catalan_bench/parafraseja.yaml @@ -0,0 +1,17 @@ +task: parafraseja +dataset_path: projecte-aina/Parafraseja +output_type: multiple_choice +dataset_name: null +test_split: test +training_split: train +validation_split: validation +doc_to_choice: '{{[sentence1+", veritat? No, "+sentence2, sentence1+", veritat? Sí, "+sentence2]}}' +process_docs: !function utils.process_docs_paraphrases +doc_to_text: '' +doc_to_target: label +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/paws_ca.yaml b/lm_eval/tasks/catalan_bench/paws_ca.yaml new file mode 100644 index 0000000000..c9fbd04f9a --- /dev/null +++ b/lm_eval/tasks/catalan_bench/paws_ca.yaml @@ -0,0 +1,20 @@ +group: + - pawsx +task: paws_ca +dataset_path: projecte-aina/PAWS-ca +dataset_name: null +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +process_docs: !function utils.process_docs_paraphrases +doc_to_text: '' +doc_to_target: label +doc_to_choice: '{{[sentence1+", veritat? No, "+sentence2, sentence1+", veritat? 
Sí, "+sentence2]}}' +target_delimiter: '' +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml b/lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml new file mode 100644 index 0000000000..48232725b2 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml @@ -0,0 +1,24 @@ +group: phrases_va +dataset_path: gplsi/CA-VA_alignment_test +output_type: generate_until +training_split: null +validation_split: null +test_split: test +fewshot_split: test +num_fewshot: 5 +target_delimiter: ' ' +generation_kwargs: + until: + - "\n" +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: ter + aggregation: ter + higher_is_better: false + - metric: chrf + aggregation: chrf + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/phrases_va/phrases_ca-va.yaml b/lm_eval/tasks/catalan_bench/phrases_va/phrases_ca-va.yaml new file mode 100644 index 0000000000..fc0e08d5a2 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/phrases_va/phrases_ca-va.yaml @@ -0,0 +1,7 @@ +# File generated by `create-yamls.py` +include: _phrases_va_common.yaml +task: phrases_ca-va +doc_to_text: 'Oració en català: {{ca}} + + Oració en valencià:' +doc_to_target: '{{va}}' diff --git a/lm_eval/tasks/catalan_bench/phrases_va/phrases_va-ca.yaml b/lm_eval/tasks/catalan_bench/phrases_va/phrases_va-ca.yaml new file mode 100644 index 0000000000..5b1a76780a --- /dev/null +++ b/lm_eval/tasks/catalan_bench/phrases_va/phrases_va-ca.yaml @@ -0,0 +1,7 @@ +# File generated by `create-yamls.py` +include: _phrases_va_common.yaml +task: phrases_va-ca +doc_to_text: 'Oració en valencià: {{va}} + + Oració en català:' +doc_to_target: '{{ca}}' diff --git a/lm_eval/tasks/catalan_bench/piqa_ca.yaml b/lm_eval/tasks/catalan_bench/piqa_ca.yaml new file mode 100644 index 
0000000000..11e600a7f1 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/piqa_ca.yaml @@ -0,0 +1,21 @@ +task: piqa_ca +dataset_path: projecte-aina/piqa_ca +dataset_name: null +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: null +doc_to_text: "Pregunta: {{goal}}\nResposta:" +doc_to_target: label +doc_to_choice: "{{[sol1, sol2]}}" +should_decontaminate: true +doc_to_decontamination_query: goal +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/siqa_ca.yaml b/lm_eval/tasks/catalan_bench/siqa_ca.yaml new file mode 100644 index 0000000000..01f0651b7c --- /dev/null +++ b/lm_eval/tasks/catalan_bench/siqa_ca.yaml @@ -0,0 +1,16 @@ +task: siqa_ca +dataset_path: projecte-aina/siqa_ca +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: null +doc_to_text: "Pregunta: {{context}} {{question}}\nResposta:" +target_delimiter: " " +doc_to_choice: "{{[answerA, answerB, answerC]}}" +doc_to_target: "{{ (label|int) - 1 }}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/teca.yaml b/lm_eval/tasks/catalan_bench/teca.yaml new file mode 100644 index 0000000000..c60acbd559 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/teca.yaml @@ -0,0 +1,18 @@ +task: teca +dataset_path: projecte-aina/teca +dataset_name: null +training_split: train +validation_split: validation +test_split: test +output_type: multiple_choice +process_docs: !function utils.process_doc_nli +doc_to_text: "" +doc_to_target: label +target_delimiter: "" +doc_to_choice: '{{[premise + ", correcte? Sí, " + hypothesis, premise + ", correcte? A més, " + hypothesis, premise + ", correcte? 
No, " + hypothesis]}}' +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/utils.py b/lm_eval/tasks/catalan_bench/utils.py new file mode 100644 index 0000000000..650749dc59 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/utils.py @@ -0,0 +1,122 @@ +import re +from itertools import product +import evaluate +import transformers.data.metrics.squad_metrics as squad_metrics +from lm_eval.utils import general_detokenize + + +def lowercase_first_letter(text): + return text[0].lower() + text[1:] + +def process_doc_nli(dataset): + def process_fn(doc): + # Detokenize(remove extra whitespaces) + doc["premise"] = general_detokenize(doc["premise"]).strip() + doc["hypothesis"] = general_detokenize(doc["hypothesis"]).strip() + # Remove last punctuation mark in the premise + doc["premise"] = doc["premise"][:-1] if doc["premise"].endswith((".", ",", "!", "?")) else doc["premise"] + # Lowercase the first letter in the hypothesis + doc["hypothesis"] = lowercase_first_letter(doc["hypothesis"]) + # Ensure that the hypothesis ends with a dot + doc["hypothesis"] = (doc["hypothesis"] + ".") if not doc["hypothesis"].endswith(".") else doc["hypothesis"] + return doc + return dataset.map(process_fn) + + +def process_results_coqcat(doc, results): + # Get all possible answers and compute the scores + turn_id = len(doc["questions"]) + answers = [doc["answers"]["input_text"][turn_id - 1]] + additional_answers_list = doc.get("additional_answers") + if additional_answers_list: + for key, additional_answers in additional_answers_list.items(): + if additional_answers["input_text"][turn_id - 1].lower() not in map(str.lower, answers): + answers.append(additional_answers["input_text"][turn_id - 1]) + + gold_list = answers + pred = results[0].strip().split("\n")[0] + #import code; code.interact(local=dict(globals(), **locals())) + + f1_sum = 0.0 + em_sum = 0.0 + if len(gold_list) > 1: + 
for i in range(len(gold_list)): + gold_answers = gold_list[0:i] + gold_list[i + 1:] + # predictions compared against (n) golds and take maximum + em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers) + f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers) + else: + em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list) + f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list) + #import code; code.interact(local=dict(globals(), **locals())) + return { + "em": em_sum / max(1, len(gold_list)), + "f1": f1_sum / max(1, len(gold_list)), + } + +def process_results_qa(doc, results): + preds = results[0] + reference = doc["answers"][0]["text"] + #import code; code.interact(local=dict(globals(), **locals())) + f1_sum = squad_metrics.compute_f1(reference, preds) + exact_match = squad_metrics.compute_exact(reference, preds) + return { + "f1": f1_sum, + "exact_match": exact_match + } + +def process_doc_cabreu(dataset): + def process_fn(doc): + # Remove duplicate spaces + doc["content"] = re.sub(r" +", " ", doc["content"]) + for summary_type, index in product(["abstractive", "extractive", "extreme"], ["a1", "a2", "a3"]): + doc["summaries"][summary_type][index] = re.sub(r" +", " ", doc["summaries"][summary_type][index]) + return doc + + return dataset.map(process_fn) + +def process_docs_paraphrases(dataset): + empty_docs = [] + def _process_doc(doc): + if doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]: + doc["sentence1"] = general_detokenize(doc["sentence1"]).strip() + doc["sentence2"] = general_detokenize(doc["sentence2"]).strip() + # Remove final punctuation mark in the first sentence + if doc["sentence1"].endswith((".", ",", ";")): + doc["sentence1"] = doc["sentence1"][:-1] + # Start the second sentence in lowercase (to be used after "Yes, ...") + doc["sentence2"] = lowercase_first_letter(doc["sentence2"]) + return doc + else: + empty_docs.append(doc) + return doc + if empty_docs != []: 
+ len_empty_docs = len(empty_docs) + print(f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}") + return dataset.filter(lambda doc: doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]).map(_process_doc) + + +def process_docs_copa_ca(dataset): + def _process_doc(doc): + doc["choice1"] = lowercase_first_letter(doc["choice1"]) + doc["choice2"] = lowercase_first_letter(doc["choice2"]) + return doc + return dataset.map(_process_doc) + + +def rouge1(items): + """ + # passthrough for efficiency + """ + return items + + +def rouge1_agg(items): + """ + Higher is better + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + rouge_scorer = evaluate.load("rouge") + return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"] + diff --git a/lm_eval/tasks/catalan_bench/wnli_ca.yaml b/lm_eval/tasks/catalan_bench/wnli_ca.yaml new file mode 100644 index 0000000000..ba9d8b1e8a --- /dev/null +++ b/lm_eval/tasks/catalan_bench/wnli_ca.yaml @@ -0,0 +1,14 @@ +task: wnli_ca +dataset_path: projecte-aina/wnli-ca +dataset_name: null +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: null +doc_to_text: "{{sentence1}}\nPregunta: {{sentence2}} Cert o Fals?\nResposta:" +doc_to_target: label +doc_to_choice: ["Fals", "Cert"] +metric_list: + - metric: acc +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/xnli_ca.yaml b/lm_eval/tasks/catalan_bench/xnli_ca.yaml new file mode 100644 index 0000000000..959b4775d1 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/xnli_ca.yaml @@ -0,0 +1,21 @@ +group: + - xnli +task: xnli_ca +dataset_path: projecte-aina/xnli-ca +dataset_name: null +include: ../xnli/xnli_common_yaml +output_type: multiple_choice +doc_to_choice: '{{[premise+", correcte? Sí, "+hypothesis,premise+", correcte? A més, + "+hypothesis,premise+", correcte? 
No, "+hypothesis]}}' +doc_to_text: '' +target_delimiter: '' +process_docs: !function utils.process_doc_nli +training_split: null +validation_split: validation +doc_to_target: label +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/xquad_ca.yaml b/lm_eval/tasks/catalan_bench/xquad_ca.yaml new file mode 100644 index 0000000000..e70a59a978 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/xquad_ca.yaml @@ -0,0 +1,25 @@ +task: xquad_ca +dataset_path: projecte-aina/xquad-ca +dataset_name: null +output_type: generate_until +doc_to_text: "Context: {{context}}\n\nPregunta: {{question}}\n\nResposta:" +doc_to_target: '{{answers[0]["text"]}}' +validation_split: null +test_split: test +target_delimiter: ' ' +process_results: !function utils.process_results_qa +test_split: test +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + - metric: f1 + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml b/lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml new file mode 100644 index 0000000000..11491d9267 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml @@ -0,0 +1,17 @@ +task: xstorycloze_ca +dataset_path: projecte-aina/xstorycloze_ca +dataset_name: ca +output_type: multiple_choice +training_split: train +validation_split: eval +doc_to_text: "{{[input_sentence_1, input_sentence_2, input_sentence_3, input_sentence_4]|join(' ')}}" +doc_to_target: "{{answer_right_ending-1}}" +doc_to_choice: "{{[sentence_quiz1, sentence_quiz2]}}" +should_decontaminate: true +doc_to_decontamination_query: "{{[input_sentence_1, input_sentence_2, input_sentence_3, input_sentence_4]|join(' ')}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true 
+metadata: + version: 1.0 \ No newline at end of file From 19fc11308db1f465e7886e1c423cf5bde828ad5f Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Mon, 29 Jul 2024 17:37:50 +0200 Subject: [PATCH 2/4] added flores_ca.yaml --- .../flores_ca/_flores_common_yaml | 2 ++ .../flores_ca/create-yamls_flores_ca.py | 2 +- .../catalan_bench/flores_ca/flores_ca-de.yaml | 1 - .../catalan_bench/flores_ca/flores_ca-en.yaml | 1 - .../catalan_bench/flores_ca/flores_ca-es.yaml | 1 - .../catalan_bench/flores_ca/flores_ca-eu.yaml | 1 - .../catalan_bench/flores_ca/flores_ca-fr.yaml | 1 - .../catalan_bench/flores_ca/flores_ca-gl.yaml | 1 - .../catalan_bench/flores_ca/flores_ca-it.yaml | 1 - .../catalan_bench/flores_ca/flores_ca-pt.yaml | 1 - .../catalan_bench/flores_ca/flores_ca.yaml | 23 +++++++++++++++++++ .../catalan_bench/flores_ca/flores_de-ca.yaml | 1 - .../catalan_bench/flores_ca/flores_en-ca.yaml | 1 - .../catalan_bench/flores_ca/flores_es-ca.yaml | 1 - .../catalan_bench/flores_ca/flores_eu-ca.yaml | 1 - .../catalan_bench/flores_ca/flores_fr-ca.yaml | 1 - .../catalan_bench/flores_ca/flores_gl-ca.yaml | 1 - .../catalan_bench/flores_ca/flores_it-ca.yaml | 1 - .../catalan_bench/flores_ca/flores_pt-ca.yaml | 1 - 19 files changed, 26 insertions(+), 17 deletions(-) create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml diff --git a/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml b/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml index 075a66b5f4..59a9b14aaf 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml @@ -21,3 +21,5 @@ metric_list: higher_is_better: true metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py b/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py index eeadc3ed76..b83295f445 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py 
+++ b/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py @@ -82,7 +82,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: yaml.dump( { # "group": [f"{BENCH_NAME}_bench", f"{BENCH_NAME}_bench_flores"], - "group": "flores_ca", +# "group": "flores_ca", "include": "_flores_common_yaml", "task": f"flores_{lang_pair_name}", "doc_to_text": doc_to_text(src, tgt), diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml index 0bb415a04b..15eb02afb6 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_ca-de doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml index 3aa1351307..9a8f5ffeb8 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_ca-en doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml index 1b0cf7c7f0..9a6aa44240 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_ca-es doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml index fd540c6425..48ffe7bf5c 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml +++ 
b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_ca-eu doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml index 5aa495781d..99b40c1462 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_ca-fr doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml index d33cdd505e..5da7ad5fe4 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_ca-gl doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml index 61431e8da2..20f8d99f9f 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_ca-it doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml index 3b0e70b550..565f6267c5 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: 
_flores_common_yaml task: flores_ca-pt doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml new file mode 100644 index 0000000000..9bc682eb5c --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml @@ -0,0 +1,23 @@ +group: flores_ca +task: + - flores_es-ca + - flores_ca-es + - flores_en-ca + - flores_ca-en + - flores_eu-ca + - flores_ca-eu + - flores_pt-ca + - flores_ca-pt + - flores_it-ca + - flores_ca-it + - flores_fr-ca + - flores_ca-fr + - flores_ca-gl + - flores_gl-ca + - flores_ca-de + - flores_de-ca +aggregate_metric_list: + - metric: bleu + aggregation: mean +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml index 363bd62ced..af3d0eb493 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_de-ca doc_to_text: 'German sentence: {{sentence_deu_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml index 81706f6dc2..16132ff497 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_en-ca doc_to_text: 'English sentence: {{sentence_eng_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml index a74437e392..e35b715213 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` 
-group: flores_ca include: _flores_common_yaml task: flores_es-ca doc_to_text: 'Spanish sentence: {{sentence_spa_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml index dcf6e3760f..c8be6ee93b 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_eu-ca doc_to_text: 'Basque sentence: {{sentence_eus_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml index a5abc7ead8..0d2de77edf 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_fr-ca doc_to_text: 'French sentence: {{sentence_fra_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml index 78c554086f..6ce3eaae5c 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_gl-ca doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml index 128834d976..db811154e5 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_it-ca doc_to_text: 'Italian sentence: {{sentence_ita_Latn}} diff --git 
a/lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml index dd355b797c..196295c9e3 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_pt-ca doc_to_text: 'Portuguese sentence: {{sentence_por_Latn}} From fb8f0c1b0f3de0faf3bc8602b69c4c23a6f19b20 Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Fri, 27 Sep 2024 18:08:33 +0200 Subject: [PATCH 3/4] Updated some task groupings and readme --- lm_eval/tasks/README.md | 1 + lm_eval/tasks/catalan_bench/README.md | 8 +-- .../tasks/catalan_bench/_arc_ca_common_yaml | 3 +- .../tasks/catalan_bench/_cabreu_common_yaml | 2 +- lm_eval/tasks/catalan_bench/catalanqa.yaml | 2 +- lm_eval/tasks/catalan_bench/copa_ca.yaml | 2 +- lm_eval/tasks/catalan_bench/coqcat.yaml | 2 +- ...flores_ca.py => create_yamls_flores_ca.py} | 0 .../catalan_bench/flores_ca/flores_ca.yaml | 1 + .../tasks/catalan_bench/mgsm_direct_ca.yaml | 2 - .../tasks/catalan_bench/openbookqa_ca.yaml | 2 +- lm_eval/tasks/catalan_bench/parafraseja.yaml | 2 +- lm_eval/tasks/catalan_bench/paws_ca.yaml | 4 +- .../phrases_va/_phrases_va_common.yaml | 2 +- lm_eval/tasks/catalan_bench/siqa_ca.yaml | 2 +- lm_eval/tasks/catalan_bench/teca.yaml | 2 +- lm_eval/tasks/catalan_bench/utils.py | 66 ++++++++++++------- lm_eval/tasks/catalan_bench/wnli_ca.yaml | 2 +- lm_eval/tasks/catalan_bench/xnli_ca.yaml | 4 +- lm_eval/tasks/catalan_bench/xquad_ca.yaml | 3 +- .../tasks/catalan_bench/xstorycloze_ca.yaml | 2 +- 21 files changed, 63 insertions(+), 51 deletions(-) rename lm_eval/tasks/catalan_bench/flores_ca/{create-yamls_flores_ca.py => create_yamls_flores_ca.py} (100%) diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index 3903db98e8..de68b320bb 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -121,3 +121,4 @@ 
| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque | | [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese | | [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese | +| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan | diff --git a/lm_eval/tasks/catalan_bench/README.md b/lm_eval/tasks/catalan_bench/README.md index 32b1b0fc34..73dec948fe 100644 --- a/lm_eval/tasks/catalan_bench/README.md +++ b/lm_eval/tasks/catalan_bench/README.md @@ -54,10 +54,11 @@ Paper for CatalanBench coming soon. #### Groups - `catalan_bench`: All tasks included in CatalanBench. -- `cabreu`: Three CaBREU tasks for each type of summary (extractive, abstractive and extreme). - `flores_ca`: All FLORES translation tasks from or to Catalan. -- `phrases_ca`: Two Phrases_va tasks for language adaptation between Catalan and Valencian. +#### Tags +- `cabreu`: Three CaBREU tasks for each type of summary (extractive, abstractive and extreme). +- `phrases_va`: Two Phrases_va tasks for language adaptation between Catalan and Valencian. #### Tasks @@ -105,9 +106,6 @@ The following tasks evaluate tasks on CatalanBench dataset using various scoring Some of these tasks are taken from benchmarks already available in LM Evaluation Harness. 
These are: - `belebele_cat_Latn`: Belebele Catalan -- `veritasqa_gen_ca`: VeritasQA Catalan -- `veritasqa_mc1_ca`: VeritasQA Catalan -- `veritasqa_mc2_ca`: VeritasQA Catalan ### Checklist diff --git a/lm_eval/tasks/catalan_bench/_arc_ca_common_yaml b/lm_eval/tasks/catalan_bench/_arc_ca_common_yaml index 38dfc08b46..b89290ebaf 100644 --- a/lm_eval/tasks/catalan_bench/_arc_ca_common_yaml +++ b/lm_eval/tasks/catalan_bench/_arc_ca_common_yaml @@ -1,5 +1,4 @@ -group: - - ai2_arc +tag: arc_ca dataset_path: projecte-aina/arc_ca output_type: multiple_choice training_split: null diff --git a/lm_eval/tasks/catalan_bench/_cabreu_common_yaml b/lm_eval/tasks/catalan_bench/_cabreu_common_yaml index 85c51a18b6..c66e8bc486 100644 --- a/lm_eval/tasks/catalan_bench/_cabreu_common_yaml +++ b/lm_eval/tasks/catalan_bench/_cabreu_common_yaml @@ -1,4 +1,4 @@ -group: cabreu +tag: cabreu dataset_path: projecte-aina/caBreu dataset_name: null output_type: generate_until diff --git a/lm_eval/tasks/catalan_bench/catalanqa.yaml b/lm_eval/tasks/catalan_bench/catalanqa.yaml index 8861794e94..926cdfa1be 100644 --- a/lm_eval/tasks/catalan_bench/catalanqa.yaml +++ b/lm_eval/tasks/catalan_bench/catalanqa.yaml @@ -22,4 +22,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/copa_ca.yaml b/lm_eval/tasks/catalan_bench/copa_ca.yaml index c7ca3f11a1..d376ad3aea 100644 --- a/lm_eval/tasks/catalan_bench/copa_ca.yaml +++ b/lm_eval/tasks/catalan_bench/copa_ca.yaml @@ -14,4 +14,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/coqcat.yaml b/lm_eval/tasks/catalan_bench/coqcat.yaml index 12ab7d7cdf..95145a7492 100644 --- a/lm_eval/tasks/catalan_bench/coqcat.yaml +++ b/lm_eval/tasks/catalan_bench/coqcat.yaml @@ -20,4 +20,4 @@ metric_list: aggregation: mean higher_is_better: true 
metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py b/lm_eval/tasks/catalan_bench/flores_ca/create_yamls_flores_ca.py similarity index 100% rename from lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py rename to lm_eval/tasks/catalan_bench/flores_ca/create_yamls_flores_ca.py diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml index 9bc682eb5c..4726daa83e 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml @@ -19,5 +19,6 @@ task: aggregate_metric_list: - metric: bleu aggregation: mean + weight_by_size: false metadata: version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml b/lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml index 754d5b91ce..066336a67f 100644 --- a/lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml +++ b/lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml @@ -1,5 +1,3 @@ -group: - - mgsm_direct task: mgsm_direct_ca dataset_path: projecte-aina/mgsm_ca doc_to_target: '{{answer_number|string}}' diff --git a/lm_eval/tasks/catalan_bench/openbookqa_ca.yaml b/lm_eval/tasks/catalan_bench/openbookqa_ca.yaml index 00a1f03d4f..868be75612 100644 --- a/lm_eval/tasks/catalan_bench/openbookqa_ca.yaml +++ b/lm_eval/tasks/catalan_bench/openbookqa_ca.yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/parafraseja.yaml b/lm_eval/tasks/catalan_bench/parafraseja.yaml index 208e3e373f..060d488d18 100644 --- a/lm_eval/tasks/catalan_bench/parafraseja.yaml +++ b/lm_eval/tasks/catalan_bench/parafraseja.yaml @@ -14,4 +14,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git 
a/lm_eval/tasks/catalan_bench/paws_ca.yaml b/lm_eval/tasks/catalan_bench/paws_ca.yaml index c9fbd04f9a..e736f5c746 100644 --- a/lm_eval/tasks/catalan_bench/paws_ca.yaml +++ b/lm_eval/tasks/catalan_bench/paws_ca.yaml @@ -1,5 +1,3 @@ -group: - - pawsx task: paws_ca dataset_path: projecte-aina/PAWS-ca dataset_name: null @@ -17,4 +15,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml b/lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml index 48232725b2..f59a2098ca 100644 --- a/lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml +++ b/lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml @@ -1,4 +1,4 @@ -group: phrases_va +tag: phrases_va dataset_path: gplsi/CA-VA_alignment_test output_type: generate_until training_split: null diff --git a/lm_eval/tasks/catalan_bench/siqa_ca.yaml b/lm_eval/tasks/catalan_bench/siqa_ca.yaml index 01f0651b7c..8a39a37f5c 100644 --- a/lm_eval/tasks/catalan_bench/siqa_ca.yaml +++ b/lm_eval/tasks/catalan_bench/siqa_ca.yaml @@ -13,4 +13,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/teca.yaml b/lm_eval/tasks/catalan_bench/teca.yaml index c60acbd559..8978c2c969 100644 --- a/lm_eval/tasks/catalan_bench/teca.yaml +++ b/lm_eval/tasks/catalan_bench/teca.yaml @@ -15,4 +15,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/utils.py b/lm_eval/tasks/catalan_bench/utils.py index 650749dc59..ced91772ca 100644 --- a/lm_eval/tasks/catalan_bench/utils.py +++ b/lm_eval/tasks/catalan_bench/utils.py @@ -1,25 +1,37 @@ import re from itertools import product + import evaluate import transformers.data.metrics.squad_metrics as squad_metrics + from 
lm_eval.utils import general_detokenize def lowercase_first_letter(text): - return text[0].lower() + text[1:] + return text[0].lower() + text[1:] + def process_doc_nli(dataset): def process_fn(doc): # Detokenize(remove extra whitespaces) doc["premise"] = general_detokenize(doc["premise"]).strip() doc["hypothesis"] = general_detokenize(doc["hypothesis"]).strip() - # Remove last punctuation mark in the premise - doc["premise"] = doc["premise"][:-1] if doc["premise"].endswith((".", ",", "!", "?")) else doc["premise"] - # Lowercase the first letter in the hypothesis + # Remove last punctuation mark in the premise + doc["premise"] = ( + doc["premise"][:-1] + if doc["premise"].endswith((".", ",", "!", "?")) + else doc["premise"] + ) + # Lowercase the first letter in the hypothesis doc["hypothesis"] = lowercase_first_letter(doc["hypothesis"]) - # Ensure that the hypothesis ends with a dot - doc["hypothesis"] = (doc["hypothesis"] + ".") if not doc["hypothesis"].endswith(".") else doc["hypothesis"] + # Ensure that the hypothesis ends with a dot + doc["hypothesis"] = ( + (doc["hypothesis"] + ".") + if not doc["hypothesis"].endswith(".") + else doc["hypothesis"] + ) return doc + return dataset.map(process_fn) @@ -30,53 +42,60 @@ def process_results_coqcat(doc, results): additional_answers_list = doc.get("additional_answers") if additional_answers_list: for key, additional_answers in additional_answers_list.items(): - if additional_answers["input_text"][turn_id - 1].lower() not in map(str.lower, answers): + if additional_answers["input_text"][turn_id - 1].lower() not in map( + str.lower, answers + ): answers.append(additional_answers["input_text"][turn_id - 1]) gold_list = answers pred = results[0].strip().split("\n")[0] - #import code; code.interact(local=dict(globals(), **locals())) - + # import code; code.interact(local=dict(globals(), **locals())) + f1_sum = 0.0 em_sum = 0.0 if len(gold_list) > 1: for i in range(len(gold_list)): - gold_answers = gold_list[0:i] + 
gold_list[i + 1:] + gold_answers = gold_list[0:i] + gold_list[i + 1 :] # predictions compared against (n) golds and take maximum em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers) f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers) else: em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list) f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list) - #import code; code.interact(local=dict(globals(), **locals())) + # import code; code.interact(local=dict(globals(), **locals())) return { "em": em_sum / max(1, len(gold_list)), "f1": f1_sum / max(1, len(gold_list)), } + def process_results_qa(doc, results): preds = results[0] reference = doc["answers"][0]["text"] - #import code; code.interact(local=dict(globals(), **locals())) + # import code; code.interact(local=dict(globals(), **locals())) f1_sum = squad_metrics.compute_f1(reference, preds) exact_match = squad_metrics.compute_exact(reference, preds) - return { - "f1": f1_sum, - "exact_match": exact_match - } + return {"f1": f1_sum, "exact_match": exact_match} + def process_doc_cabreu(dataset): def process_fn(doc): # Remove duplicate spaces doc["content"] = re.sub(r" +", " ", doc["content"]) - for summary_type, index in product(["abstractive", "extractive", "extreme"], ["a1", "a2", "a3"]): - doc["summaries"][summary_type][index] = re.sub(r" +", " ", doc["summaries"][summary_type][index]) + for summary_type, index in product( + ["abstractive", "extractive", "extreme"], ["a1", "a2", "a3"] + ): + doc["summaries"][summary_type][index] = re.sub( + r" +", " ", doc["summaries"][summary_type][index] + ) return doc return dataset.map(process_fn) + def process_docs_paraphrases(dataset): empty_docs = [] + def _process_doc(doc): if doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]: doc["sentence1"] = general_detokenize(doc["sentence1"]).strip() @@ -90,10 +109,11 @@ def _process_doc(doc): else: empty_docs.append(doc) return doc - if 
empty_docs != []: - len_empty_docs = len(empty_docs) - print(f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}") - return dataset.filter(lambda doc: doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]).map(_process_doc) + + return dataset.filter( + lambda doc: doc["sentence1"] not in [None, ""] + and doc["sentence2"] not in [None, ""] + ).map(_process_doc) def process_docs_copa_ca(dataset): @@ -101,6 +121,7 @@ def _process_doc(doc): doc["choice1"] = lowercase_first_letter(doc["choice1"]) doc["choice2"] = lowercase_first_letter(doc["choice2"]) return doc + return dataset.map(_process_doc) @@ -119,4 +140,3 @@ def rouge1_agg(items): preds = list(zip(*items))[1] rouge_scorer = evaluate.load("rouge") return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"] - diff --git a/lm_eval/tasks/catalan_bench/wnli_ca.yaml b/lm_eval/tasks/catalan_bench/wnli_ca.yaml index ba9d8b1e8a..d4deec5c04 100644 --- a/lm_eval/tasks/catalan_bench/wnli_ca.yaml +++ b/lm_eval/tasks/catalan_bench/wnli_ca.yaml @@ -11,4 +11,4 @@ doc_to_choice: ["Fals", "Cert"] metric_list: - metric: acc metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/xnli_ca.yaml b/lm_eval/tasks/catalan_bench/xnli_ca.yaml index 959b4775d1..44f0f44302 100644 --- a/lm_eval/tasks/catalan_bench/xnli_ca.yaml +++ b/lm_eval/tasks/catalan_bench/xnli_ca.yaml @@ -1,5 +1,3 @@ -group: - - xnli task: xnli_ca dataset_path: projecte-aina/xnli-ca dataset_name: null @@ -18,4 +16,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/xquad_ca.yaml b/lm_eval/tasks/catalan_bench/xquad_ca.yaml index e70a59a978..9b72c7da74 100644 --- a/lm_eval/tasks/catalan_bench/xquad_ca.yaml +++ b/lm_eval/tasks/catalan_bench/xquad_ca.yaml @@ -8,7 +8,6 @@ validation_split: null test_split: test 
target_delimiter: ' ' process_results: !function utils.process_results_qa -test_split: test generation_kwargs: until: - "\n" @@ -22,4 +21,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml b/lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml index 11491d9267..61a7c2991f 100644 --- a/lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml +++ b/lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml @@ -14,4 +14,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 From 43c9a497b9805ec48e1593aee88c1a8d14a8a096 Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Mon, 30 Sep 2024 17:30:45 +0200 Subject: [PATCH 4/4] Fix create_yamls_flores_ca.py --- .../flores_ca/create_yamls_flores_ca.py | 299 +++++++++++++++--- 1 file changed, 259 insertions(+), 40 deletions(-) diff --git a/lm_eval/tasks/catalan_bench/flores_ca/create_yamls_flores_ca.py b/lm_eval/tasks/catalan_bench/flores_ca/create_yamls_flores_ca.py index b83295f445..6125b97266 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/create_yamls_flores_ca.py +++ b/lm_eval/tasks/catalan_bench/flores_ca/create_yamls_flores_ca.py @@ -4,50 +4,256 @@ """ import argparse + import yaml -from langcodes import * -from itertools import * +from langcodes import Language -# utils -flatten = lambda l: list(itertools.chain(*l)) # constants _LANGUAGES = [ -"ace_Arab", "bam_Latn", "dzo_Tibt", "hin_Deva", "khm_Khmr", "mag_Deva", "pap_Latn", "sot_Latn", "tur_Latn", -"ace_Latn", "ban_Latn", "ell_Grek", "hne_Deva", "kik_Latn", "mai_Deva", "pbt_Arab", "spa_Latn", "twi_Latn", -"acm_Arab", "bel_Cyrl", "eng_Latn", "hrv_Latn", "kin_Latn", "mal_Mlym", "pes_Arab", "srd_Latn", "tzm_Tfng", -"acq_Arab", "bem_Latn", "epo_Latn", "hun_Latn", "kir_Cyrl", "mar_Deva", "plt_Latn", "srp_Cyrl", "uig_Arab", -"aeb_Arab", "ben_Beng", "est_Latn", "hye_Armn", "kmb_Latn", 
"min_Arab", "pol_Latn", "ssw_Latn", "ukr_Cyrl", -"afr_Latn", "bho_Deva", "eus_Latn", "ibo_Latn", "kmr_Latn", "min_Latn", "por_Latn", "sun_Latn", "umb_Latn", -"ajp_Arab", "bjn_Arab", "ewe_Latn", "ilo_Latn", "knc_Arab", "mkd_Cyrl", "prs_Arab", "swe_Latn", "urd_Arab", -"aka_Latn", "bjn_Latn", "fao_Latn", "ind_Latn", "knc_Latn", "mlt_Latn", "quy_Latn", "swh_Latn", "uzn_Latn", -"als_Latn", "bod_Tibt", "fij_Latn", "isl_Latn", "kon_Latn", "mni_Beng", "ron_Latn", "szl_Latn", "vec_Latn", -"amh_Ethi", "bos_Latn", "fin_Latn", "ita_Latn", "kor_Hang", "mos_Latn", "run_Latn", "tam_Taml", "vie_Latn", -"apc_Arab", "bug_Latn", "fon_Latn", "jav_Latn", "lao_Laoo", "mri_Latn", "rus_Cyrl", "taq_Latn", "war_Latn", -"arb_Arab", "bul_Cyrl", "fra_Latn", "jpn_Jpan", "lij_Latn", "mya_Mymr", "sag_Latn", "taq_Tfng", "wol_Latn", -"arb_Latn", "cat_Latn", "fur_Latn", "kab_Latn", "lim_Latn", "nld_Latn", "san_Deva", "tat_Cyrl", "xho_Latn", -"ars_Arab", "ceb_Latn", "fuv_Latn", "kac_Latn", "lin_Latn", "nno_Latn", "sat_Olck", "tel_Telu", "ydd_Hebr", -"ary_Arab", "ces_Latn", "gaz_Latn", "kam_Latn", "lit_Latn", "nob_Latn", "scn_Latn", "tgk_Cyrl", "yor_Latn", -"arz_Arab", "cjk_Latn", "gla_Latn", "kan_Knda", "lmo_Latn", "npi_Deva", "shn_Mymr", "tgl_Latn", "yue_Hant", -"asm_Beng", "ckb_Arab", "gle_Latn", "kas_Arab", "ltg_Latn", "nso_Latn", "sin_Sinh", "tha_Thai", "zho_Hans", -"ast_Latn", "crh_Latn", "glg_Latn", "kas_Deva", "ltz_Latn", "nus_Latn", "slk_Latn", "tir_Ethi", "zho_Hant", -"awa_Deva", "cym_Latn", "grn_Latn", "kat_Geor", "lua_Latn", "nya_Latn", "slv_Latn", "tpi_Latn", "zsm_Latn", -"ayr_Latn", "dan_Latn", "guj_Gujr", "kaz_Cyrl", "lug_Latn", "oci_Latn", "smo_Latn", "tsn_Latn", "zul_Latn", -"azb_Arab", "deu_Latn", "hat_Latn", "kbp_Latn", "luo_Latn", "ory_Orya", "sna_Latn", "tso_Latn", -"azj_Latn", "dik_Latn", "hau_Latn", "kea_Latn", "lus_Latn", "pag_Latn", "snd_Arab", "tuk_Latn", -"bak_Cyrl", "dyu_Latn", "heb_Hebr", "khk_Cyrl", "lvs_Latn", "pan_Guru", "som_Latn", "tum_Latn" + "ace_Arab", + 
"bam_Latn", + "dzo_Tibt", + "hin_Deva", + "khm_Khmr", + "mag_Deva", + "pap_Latn", + "sot_Latn", + "tur_Latn", + "ace_Latn", + "ban_Latn", + "ell_Grek", + "hne_Deva", + "kik_Latn", + "mai_Deva", + "pbt_Arab", + "spa_Latn", + "twi_Latn", + "acm_Arab", + "bel_Cyrl", + "eng_Latn", + "hrv_Latn", + "kin_Latn", + "mal_Mlym", + "pes_Arab", + "srd_Latn", + "tzm_Tfng", + "acq_Arab", + "bem_Latn", + "epo_Latn", + "hun_Latn", + "kir_Cyrl", + "mar_Deva", + "plt_Latn", + "srp_Cyrl", + "uig_Arab", + "aeb_Arab", + "ben_Beng", + "est_Latn", + "hye_Armn", + "kmb_Latn", + "min_Arab", + "pol_Latn", + "ssw_Latn", + "ukr_Cyrl", + "afr_Latn", + "bho_Deva", + "eus_Latn", + "ibo_Latn", + "kmr_Latn", + "min_Latn", + "por_Latn", + "sun_Latn", + "umb_Latn", + "ajp_Arab", + "bjn_Arab", + "ewe_Latn", + "ilo_Latn", + "knc_Arab", + "mkd_Cyrl", + "prs_Arab", + "swe_Latn", + "urd_Arab", + "aka_Latn", + "bjn_Latn", + "fao_Latn", + "ind_Latn", + "knc_Latn", + "mlt_Latn", + "quy_Latn", + "swh_Latn", + "uzn_Latn", + "als_Latn", + "bod_Tibt", + "fij_Latn", + "isl_Latn", + "kon_Latn", + "mni_Beng", + "ron_Latn", + "szl_Latn", + "vec_Latn", + "amh_Ethi", + "bos_Latn", + "fin_Latn", + "ita_Latn", + "kor_Hang", + "mos_Latn", + "run_Latn", + "tam_Taml", + "vie_Latn", + "apc_Arab", + "bug_Latn", + "fon_Latn", + "jav_Latn", + "lao_Laoo", + "mri_Latn", + "rus_Cyrl", + "taq_Latn", + "war_Latn", + "arb_Arab", + "bul_Cyrl", + "fra_Latn", + "jpn_Jpan", + "lij_Latn", + "mya_Mymr", + "sag_Latn", + "taq_Tfng", + "wol_Latn", + "arb_Latn", + "cat_Latn", + "fur_Latn", + "kab_Latn", + "lim_Latn", + "nld_Latn", + "san_Deva", + "tat_Cyrl", + "xho_Latn", + "ars_Arab", + "ceb_Latn", + "fuv_Latn", + "kac_Latn", + "lin_Latn", + "nno_Latn", + "sat_Olck", + "tel_Telu", + "ydd_Hebr", + "ary_Arab", + "ces_Latn", + "gaz_Latn", + "kam_Latn", + "lit_Latn", + "nob_Latn", + "scn_Latn", + "tgk_Cyrl", + "yor_Latn", + "arz_Arab", + "cjk_Latn", + "gla_Latn", + "kan_Knda", + "lmo_Latn", + "npi_Deva", + "shn_Mymr", + "tgl_Latn", + "yue_Hant", 
+ "asm_Beng", + "ckb_Arab", + "gle_Latn", + "kas_Arab", + "ltg_Latn", + "nso_Latn", + "sin_Sinh", + "tha_Thai", + "zho_Hans", + "ast_Latn", + "crh_Latn", + "glg_Latn", + "kas_Deva", + "ltz_Latn", + "nus_Latn", + "slk_Latn", + "tir_Ethi", + "zho_Hant", + "awa_Deva", + "cym_Latn", + "grn_Latn", + "kat_Geor", + "lua_Latn", + "nya_Latn", + "slv_Latn", + "tpi_Latn", + "zsm_Latn", + "ayr_Latn", + "dan_Latn", + "guj_Gujr", + "kaz_Cyrl", + "lug_Latn", + "oci_Latn", + "smo_Latn", + "tsn_Latn", + "zul_Latn", + "azb_Arab", + "deu_Latn", + "hat_Latn", + "kbp_Latn", + "luo_Latn", + "ory_Orya", + "sna_Latn", + "tso_Latn", + "azj_Latn", + "dik_Latn", + "hau_Latn", + "kea_Latn", + "lus_Latn", + "pag_Latn", + "snd_Arab", + "tuk_Latn", + "bak_Cyrl", + "dyu_Latn", + "heb_Hebr", + "khk_Cyrl", + "lvs_Latn", + "pan_Guru", + "som_Latn", + "tum_Latn", +] +LANGUAGE_PAIRS = [ + (a, b) for idx, a in enumerate(_LANGUAGES) for b in _LANGUAGES[idx + 1 :] ] -LANGUAGE_PAIRS = [(a, b) for idx, a in enumerate(_LANGUAGES) for b in _LANGUAGES[idx + 1:]] -LANGUAGES_OF_INTEREST = ["cat_Latn", "spa_Latn", "eng_Latn", "glg_Latn", "eus_Latn", "ita_Latn", "deu_Latn", "por_Latn", "fra_Latn"] +LANGUAGES_OF_INTEREST = [ + "cat_Latn", + "spa_Latn", + "eng_Latn", + "glg_Latn", + "eus_Latn", + "ita_Latn", + "deu_Latn", + "por_Latn", + "fra_Latn", +] MAIN_LANG = "cat_Latn" -LANGUAGE_PAIRS = [(a, b) for (a, b) in LANGUAGE_PAIRS if a in LANGUAGES_OF_INTEREST and b in LANGUAGES_OF_INTEREST and "cat_Latn" in (a, b)] +LANGUAGE_PAIRS = [ + (a, b) + for (a, b) in LANGUAGE_PAIRS + if a in LANGUAGES_OF_INTEREST + and b in LANGUAGES_OF_INTEREST + and "cat_Latn" in (a, b) +] # auxiliary functions -code_to_language_name = lambda code: Language.make(language=Language.get(code)["language"]).display_name() -code_to_short_name = lambda code: Language.get(code)["language"] -jinja_var = lambda s: "{{" + s + "}}" # wrapper to avoid having to escape { } in format strings + +def code_to_language_name(code): + return 
Language.make(language=Language.get(code)["language"]).display_name() + + +def code_to_short_name(code): + return Language.get(code)["language"] + + +def jinja_var(s): + return "{{" + s + "}}" + def doc_to_text(src: str, tgt: str) -> str: src_name, tgt_name = map(code_to_language_name, [src, tgt]) @@ -56,12 +262,14 @@ def doc_to_text(src: str, tgt: str) -> str: {src_name} sentence: {jinja_var('sentence_' + src)} {tgt_name} sentence:""" -def doc_to_target(tgt: str) -> str: +def doc_to_target(tgt: str) -> str: return f"{jinja_var('sentence_' + tgt)}" + # main function + def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: """ Generate a YAML file for each translation direction. @@ -69,20 +277,23 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: err = [] for src, tgt in LANGUAGE_PAIRS: - # do both translation directions for each lang pair for src, tgt in [(src, tgt), (tgt, src)]: lang_pair_name = f"{code_to_short_name(src)}-{code_to_short_name(tgt)}" yaml_file_name = f"flores_{lang_pair_name}.yaml" try: - with open( f"{output_dir}/{yaml_file_name}", "w" if overwrite else "x", encoding="utf-8") as outfile: + with open( + f"{output_dir}/{yaml_file_name}", + "w" if overwrite else "x", + encoding="utf-8", + ) as outfile: print(f"Creating {yaml_file_name}...") outfile.write("# File generated by `create-yamls.py`\n") yaml.dump( { -# "group": [f"{BENCH_NAME}_bench", f"{BENCH_NAME}_bench_flores"], -# "group": "flores_ca", + # "group": [f"{BENCH_NAME}_bench", f"{BENCH_NAME}_bench_flores"], + # "group": "flores_ca", "include": "_flores_common_yaml", "task": f"flores_{lang_pair_name}", "doc_to_text": doc_to_text(src, tgt), @@ -105,11 +316,19 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: def main() -> None: parser = argparse.ArgumentParser() - parser.add_argument("--overwrite", default=False, action="store_true", help="Overwrite files if they already exist") - parser.add_argument( "--output-dir", default=".", help="Directory to write 
yaml files to" ) + parser.add_argument( + "--overwrite", + default=False, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", default=".", help="Directory to write yaml files to" + ) args = parser.parse_args() gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite) + if __name__ == "__main__": main()