From 087139a4a3f29af4fbb3d6081a2c50c1ed913548 Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Fri, 19 Jul 2024 14:20:00 +0200 Subject: [PATCH 1/5] Add galician_bench --- lm_eval/tasks/galician_bench/README.md | 80 ++++++ .../galician_bench/belebele_glg_Latn.yaml | 9 + .../flores_gl/_flores_common_yaml | 26 ++ .../flores_gl/create-yamls_flores_gl.py | 115 ++++++++ .../flores_gl/flores_ca-gl.yaml | 8 + .../flores_gl/flores_de-gl.yaml | 8 + .../flores_gl/flores_en-gl.yaml | 8 + .../flores_gl/flores_es-gl.yaml | 8 + .../flores_gl/flores_eu-gl.yaml | 8 + .../flores_gl/flores_fr-gl.yaml | 8 + .../flores_gl/flores_gl-ca.yaml | 8 + .../flores_gl/flores_gl-de.yaml | 8 + .../flores_gl/flores_gl-en.yaml | 8 + .../flores_gl/flores_gl-es.yaml | 8 + .../flores_gl/flores_gl-eu.yaml | 8 + .../flores_gl/flores_gl-fr.yaml | 8 + .../flores_gl/flores_gl-it.yaml | 8 + .../flores_gl/flores_gl-pt.yaml | 8 + .../flores_gl/flores_it-gl.yaml | 8 + .../flores_gl/flores_pt-gl.yaml | 8 + lm_eval/tasks/galician_bench/galcola.yaml | 16 ++ .../tasks/galician_bench/galician_bench.yaml | 15 + .../tasks/galician_bench/mgsm_direct_gl.yaml | 27 ++ .../tasks/galician_bench/openbookqa_gl.yaml | 21 ++ .../tasks/galician_bench/parafrases_gl.yaml | 18 ++ lm_eval/tasks/galician_bench/paws_gl.yaml | 20 ++ .../galician_bench/summarization_gl.yaml | 21 ++ .../galician_bench/truthfulqa_gl_gen.yaml | 70 +++++ .../galician_bench/truthfulqa_gl_mc1.yaml | 36 +++ .../galician_bench/truthfulqa_gl_mc2.yaml | 15 + lm_eval/tasks/galician_bench/utils.py | 262 ++++++++++++++++++ lm_eval/tasks/galician_bench/xnli_gl.yaml | 22 ++ .../tasks/galician_bench/xstorycloze_gl.yaml | 16 ++ 33 files changed, 917 insertions(+) create mode 100644 lm_eval/tasks/galician_bench/README.md create mode 100644 lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py create mode 100644 
lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml create mode 100644 lm_eval/tasks/galician_bench/galcola.yaml create mode 100644 lm_eval/tasks/galician_bench/galician_bench.yaml create mode 100644 lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml create mode 100644 lm_eval/tasks/galician_bench/openbookqa_gl.yaml create mode 100644 lm_eval/tasks/galician_bench/parafrases_gl.yaml create mode 100644 lm_eval/tasks/galician_bench/paws_gl.yaml create mode 100644 lm_eval/tasks/galician_bench/summarization_gl.yaml create mode 100644 lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml create mode 100644 lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml create mode 100644 lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml create mode 100644 lm_eval/tasks/galician_bench/utils.py create mode 100644 lm_eval/tasks/galician_bench/xnli_gl.yaml create 
mode 100644 lm_eval/tasks/galician_bench/xstorycloze_gl.yaml diff --git a/lm_eval/tasks/galician_bench/README.md b/lm_eval/tasks/galician_bench/README.md new file mode 100644 index 0000000000..24dd7a7435 --- /dev/null +++ b/lm_eval/tasks/galician_bench/README.md @@ -0,0 +1,80 @@ +# GalicianBench + +### Paper + +GalicianBench is a benchmark for evaluating language models in Galician tasks. That is, it evaluates the ability of a language model to understand and generate Galician text. GalicianBench offers a combination of pre-existing, open datasets and datasets developed exclusively for this benchmark. All the details of GalicianBench will be published in a paper soon. + +The new evaluation datasets included in GalicianBench are: +| Task | Category | Homepage | +|:-------------:|:-----:|:-----:| +| Belebele_gl | Reading Comprehension | https://huggingface.co/datasets/proxectonos/belebele_gl | +| GalCoLA | Linguistic Acceptability | https://huggingface.co/datasets/proxectonos/galcola | +| MGSM_gl | Math | https://huggingface.co/datasets/proxectonos/mgsm_gl | +| Parafrases_gl | Paraphrasing | https://huggingface.co/datasets/proxectonos/parafrases_gl | +| PAWS-gl | Paraphrasing | https://huggingface.co/datasets/proxectonos/PAWS-gl | +| OpenBookQA_gl | Question Answering | https://huggingface.co/datasets/proxectonos/openbookqa_gl | +| Summarization_gl | Summarization | https://huggingface.co/datasets/proxectonos/summarization_gl | +| TruthfulQA_gl | Truthfulness | https://huggingface.co/datasets/proxectonos/truthfulqa_gl | +| xnli_gl | NLI | https://huggingface.co/datasets/proxectonos/xnli_gl | +| xstorycloze_gl | Commonsense Reasoning | https://huggingface.co/datasets/proxectonos/xstorycloze_gl | + +The datasets included in GalicianBench that have been made public in previous publications are: + +| Task | Category | Paper title | Homepage | +|:-------------:|:-----:|:-------------:|:-----:| +| FLORES_gl | Translation | [The FLORES-101 Evaluation Benchmark for 
Low-Resource and Multilingual Machine Translation](https://arxiv.org/abs/2106.03193) | https://huggingface.co/datasets/facebook/flores | + + +### Citation +Paper for GalicianBench coming soon. + +### Groups and Tasks + +#### Groups + +- `galician_bench`: All tasks included in GalicianBench. +- `flores_gl`: All FLORES translation tasks from or to Galician. + + +#### Tasks + +The following tasks evaluate tasks on GalicianBench dataset using various scoring methods. + - `belebele_glg_Latn` + - `flores_gl` + - `flores_gl-ca` + - `flores_gl-de` + - `flores_gl-en` + - `flores_gl-es` + - `flores_gl-eu` + - `flores_gl-fr` + - `flores_gl-it` + - `flores_gl-pt` + - `flores_ca-gl` + - `flores_de-gl` + - `flores_en-gl` + - `flores_es-gl` + - `flores_eu-gl` + - `flores_fr-gl` + - `flores_it-gl` + - `flores_pt-gl` + - `galcola` + - `summarization_gl` + - `parafrases_gl` + - `paws_gl` + - `openbookqa_gl` + - `mgsm_direct_gl` + - `truthfulqa_gl` + - `xnli_gl` + - `xstorycloze_gl` + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? + * [ ] Yes, original implementation contributed by author of the benchmark + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml b/lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml new file mode 100644 index 0000000000..80aea572d7 --- /dev/null +++ b/lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml @@ -0,0 +1,9 @@ +group: + - belebele +task: belebele_glg_Latn +include: ../belebele/_default_template_yaml +dataset_path: proxectonos/belebele_gl +fewshot_split: train +test_split: train +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml b/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml new file mode 100644 index 0000000000..e7b7ea6bac --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml @@ -0,0 +1,26 @@ +group: flores +dataset_path: facebook/flores +dataset_name: all +output_type: generate_until +#! The test split of flores is not publicly available! (See paper section 6.1) +#! We are using `dev` and `devtest` splits, but they're mapped to train/validation/test in `data/flores/flores.py`. +training_split: dev +validation_split: dev +test_split: devtest +fewshot_split: dev +target_delimiter: '' +generation_kwargs: + until: + - "\n" +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: ter + aggregation: ter + higher_is_better: false + - metric: chrf + aggregation: chrf + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py b/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py new file mode 100644 index 0000000000..7d805de62a --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py @@ -0,0 +1,115 @@ +""" +Script to generate task YAMLs for the FLORES-200 dataset. +Based on `tasks/translation/utils.py`. 
+""" + +import argparse +import yaml +from langcodes import * +from itertools import * + +# utils +flatten = lambda l: list(itertools.chain(*l)) + +# constants +_LANGUAGES = [ +"ace_Arab", "bam_Latn", "dzo_Tibt", "hin_Deva", "khm_Khmr", "mag_Deva", "pap_Latn", "sot_Latn", "tur_Latn", +"ace_Latn", "ban_Latn", "ell_Grek", "hne_Deva", "kik_Latn", "mai_Deva", "pbt_Arab", "spa_Latn", "twi_Latn", +"acm_Arab", "bel_Cyrl", "eng_Latn", "hrv_Latn", "kin_Latn", "mal_Mlym", "pes_Arab", "srd_Latn", "tzm_Tfng", +"acq_Arab", "bem_Latn", "epo_Latn", "hun_Latn", "kir_Cyrl", "mar_Deva", "plt_Latn", "srp_Cyrl", "uig_Arab", +"aeb_Arab", "ben_Beng", "est_Latn", "hye_Armn", "kmb_Latn", "min_Arab", "pol_Latn", "ssw_Latn", "ukr_Cyrl", +"afr_Latn", "bho_Deva", "eus_Latn", "ibo_Latn", "kmr_Latn", "min_Latn", "por_Latn", "sun_Latn", "umb_Latn", +"ajp_Arab", "bjn_Arab", "ewe_Latn", "ilo_Latn", "knc_Arab", "mkd_Cyrl", "prs_Arab", "swe_Latn", "urd_Arab", +"aka_Latn", "bjn_Latn", "fao_Latn", "ind_Latn", "knc_Latn", "mlt_Latn", "quy_Latn", "swh_Latn", "uzn_Latn", +"als_Latn", "bod_Tibt", "fij_Latn", "isl_Latn", "kon_Latn", "mni_Beng", "ron_Latn", "szl_Latn", "vec_Latn", +"amh_Ethi", "bos_Latn", "fin_Latn", "ita_Latn", "kor_Hang", "mos_Latn", "run_Latn", "tam_Taml", "vie_Latn", +"apc_Arab", "bug_Latn", "fon_Latn", "jav_Latn", "lao_Laoo", "mri_Latn", "rus_Cyrl", "taq_Latn", "war_Latn", +"arb_Arab", "bul_Cyrl", "fra_Latn", "jpn_Jpan", "lij_Latn", "mya_Mymr", "sag_Latn", "taq_Tfng", "wol_Latn", +"arb_Latn", "cat_Latn", "fur_Latn", "kab_Latn", "lim_Latn", "nld_Latn", "san_Deva", "tat_Cyrl", "xho_Latn", +"ars_Arab", "ceb_Latn", "fuv_Latn", "kac_Latn", "lin_Latn", "nno_Latn", "sat_Olck", "tel_Telu", "ydd_Hebr", +"ary_Arab", "ces_Latn", "gaz_Latn", "kam_Latn", "lit_Latn", "nob_Latn", "scn_Latn", "tgk_Cyrl", "yor_Latn", +"arz_Arab", "cjk_Latn", "gla_Latn", "kan_Knda", "lmo_Latn", "npi_Deva", "shn_Mymr", "tgl_Latn", "yue_Hant", +"asm_Beng", "ckb_Arab", "gle_Latn", "kas_Arab", "ltg_Latn", "nso_Latn", 
"sin_Sinh", "tha_Thai", "zho_Hans", +"ast_Latn", "crh_Latn", "glg_Latn", "kas_Deva", "ltz_Latn", "nus_Latn", "slk_Latn", "tir_Ethi", "zho_Hant", +"awa_Deva", "cym_Latn", "grn_Latn", "kat_Geor", "lua_Latn", "nya_Latn", "slv_Latn", "tpi_Latn", "zsm_Latn", +"ayr_Latn", "dan_Latn", "guj_Gujr", "kaz_Cyrl", "lug_Latn", "oci_Latn", "smo_Latn", "tsn_Latn", "zul_Latn", +"azb_Arab", "deu_Latn", "hat_Latn", "kbp_Latn", "luo_Latn", "ory_Orya", "sna_Latn", "tso_Latn", +"azj_Latn", "dik_Latn", "hau_Latn", "kea_Latn", "lus_Latn", "pag_Latn", "snd_Arab", "tuk_Latn", +"bak_Cyrl", "dyu_Latn", "heb_Hebr", "khk_Cyrl", "lvs_Latn", "pan_Guru", "som_Latn", "tum_Latn" +] +LANGUAGE_PAIRS = [(a, b) for idx, a in enumerate(_LANGUAGES) for b in _LANGUAGES[idx + 1:]] + +LANGUAGES_OF_INTEREST = ["cat_Latn", "spa_Latn", "eng_Latn", "glg_Latn", "eus_Latn", "ita_Latn", "deu_Latn", "por_Latn", "fra_Latn"] +MAIN_LANG = "glg_Latn" +LANGUAGE_PAIRS = [(a, b) for (a, b) in LANGUAGE_PAIRS if a in LANGUAGES_OF_INTEREST and b in LANGUAGES_OF_INTEREST and MAIN_LANG in (a, b)] + +# auxiliary functions + +code_to_language_name = lambda code: Language.make(language=Language.get(code)["language"]).display_name() +code_to_short_name = lambda code: Language.get(code)["language"] +jinja_var = lambda s: "{{" + s + "}}" # wrapper to avoid having to escape { } in format strings + +def doc_to_text(src: str, tgt: str) -> str: + src_name, tgt_name = map(code_to_language_name, [src, tgt]) + + return f"""\ +{src_name} sentence: {jinja_var('sentence_' + src)} +{tgt_name} sentence:""" + +def doc_to_target(tgt: str) -> str: + + return f"{jinja_var('sentence_' + tgt)}" + +# main function + +def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: + """ + Generate a YAML file for each translation direction. 
+ """ + + err = [] + for src, tgt in LANGUAGE_PAIRS: + + # do both translation directions for each lang pair + for src, tgt in [(src, tgt), (tgt, src)]: + lang_pair_name = f"{code_to_short_name(src)}-{code_to_short_name(tgt)}" + yaml_file_name = f"flores_{lang_pair_name}.yaml" + + try: + with open( f"{output_dir}/{yaml_file_name}", "w" if overwrite else "x", encoding="utf-8") as outfile: + print(f"Creating {yaml_file_name}...") + outfile.write("# File generated by `create-yamls.py`\n") + yaml.dump( + { +# "group": [f"{BENCH_NAME}_bench", f"{BENCH_NAME}_bench_flores"], + "group": "flores_gl", + "include": "_flores_common_yaml", + "task": f"flores_{lang_pair_name}", + "doc_to_text": doc_to_text(src, tgt), + "doc_to_target": doc_to_target(tgt), + }, + outfile, + sort_keys=False, + ) + + except FileExistsError: + err.append(yaml_file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist:" + f" {', '.join(err)}" + "\nUse flag --overwrite to overwrite them." 
+ ) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--overwrite", default=False, action="store_true", help="Overwrite files if they already exist") + parser.add_argument( "--output-dir", default=".", help="Directory to write yaml files to" ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite) + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml new file mode 100644 index 0000000000..fc237527d1 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_ca-gl +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml new file mode 100644 index 0000000000..4217ff625e --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_de-gl +doc_to_text: 'German sentence: {{sentence_deu_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml new file mode 100644 index 0000000000..f0cff50781 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_en-gl +doc_to_text: 'English sentence: {{sentence_eng_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml 
b/lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml new file mode 100644 index 0000000000..b41b13889f --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_es-gl +doc_to_text: 'Spanish sentence: {{sentence_spa_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml new file mode 100644 index 0000000000..d4963e73e0 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_eu-gl +doc_to_text: 'Basque sentence: {{sentence_eus_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml new file mode 100644 index 0000000000..61d810196a --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_fr-gl +doc_to_text: 'French sentence: {{sentence_fra_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml new file mode 100644 index 0000000000..e17f7748d8 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-ca +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml 
b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml new file mode 100644 index 0000000000..8d396c1c3a --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-de +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + German sentence:' +doc_to_target: '{{sentence_deu_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml new file mode 100644 index 0000000000..679de2e9ef --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-en +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + English sentence:' +doc_to_target: '{{sentence_eng_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml new file mode 100644 index 0000000000..8192757472 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-es +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + Spanish sentence:' +doc_to_target: '{{sentence_spa_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml new file mode 100644 index 0000000000..0c3f7563ab --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-eu +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + Basque sentence:' +doc_to_target: '{{sentence_eus_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml 
b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml new file mode 100644 index 0000000000..99ef11bda9 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-fr +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + French sentence:' +doc_to_target: '{{sentence_fra_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml new file mode 100644 index 0000000000..ce7203fd06 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-it +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + Italian sentence:' +doc_to_target: '{{sentence_ita_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml new file mode 100644 index 0000000000..5d80286f87 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_gl-pt +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + Portuguese sentence:' +doc_to_target: '{{sentence_por_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml new file mode 100644 index 0000000000..bc69cc77ea --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_it-gl +doc_to_text: 'Italian sentence: {{sentence_ita_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml 
b/lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml new file mode 100644 index 0000000000..4dea2e0e79 --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_gl +include: _flores_common_yaml +task: flores_pt-gl +doc_to_text: 'Portuguese sentence: {{sentence_por_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/galician_bench/galcola.yaml b/lm_eval/tasks/galician_bench/galcola.yaml new file mode 100644 index 0000000000..e53d3d601e --- /dev/null +++ b/lm_eval/tasks/galician_bench/galcola.yaml @@ -0,0 +1,16 @@ +task: galcola +dataset_path: proxectonos/galcola +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +doc_to_text: "{{sentence}}\nPregunta: Ten sentido esta frase?\nResposta:" +doc_to_target: label +doc_to_choice: ["non", "si"] +should_decontaminate: true +doc_to_decontamination_query: sentence +metric_list: + - metric: mcc + - metric: acc +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/galician_bench.yaml b/lm_eval/tasks/galician_bench/galician_bench.yaml new file mode 100644 index 0000000000..3624517a97 --- /dev/null +++ b/lm_eval/tasks/galician_bench/galician_bench.yaml @@ -0,0 +1,15 @@ +group: galician_bench +task: + - belebele_glg_Latn + - flores_gl + - galcola + - summarization_gl + - parafrases_gl + - paws_gl + - openbookqa_gl + - mgsm_direct_gl + - truthfulqa_gl + - xnli_gl + - xstorycloze_gl +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml b/lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml new file mode 100644 index 0000000000..d9d8ca3bb1 --- /dev/null +++ b/lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml @@ -0,0 +1,27 @@ +group: + - mgsm_direct +task: mgsm_direct_gl +dataset_path: proxectonos/mgsm_gl +doc_to_target: '{{answer_number|string}}' +doc_to_text: '{% if answer != None %}{{question + 
"\nResposta: "}}{% else %}{{"Pregunta: " + question + "\nResposta: "}}{% endif %}' +output_type: generate_until +training_split: train +test_split: test +target_delimiter: "" +generation_kwargs: + until: + - "\n\n" + - "\n" +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/openbookqa_gl.yaml b/lm_eval/tasks/galician_bench/openbookqa_gl.yaml new file mode 100644 index 0000000000..d11a048c77 --- /dev/null +++ b/lm_eval/tasks/galician_bench/openbookqa_gl.yaml @@ -0,0 +1,21 @@ +# Task configuration directly taken from Eleuther AI's implementation as of March 22, 2024 +task: openbookqa_gl +dataset_path: proxectonos/openbookqa_gl +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +doc_to_text: question_stem +doc_to_target: "{{choices.label.index(answerKey.lstrip())}}" +doc_to_choice: "{{choices.text}}" +should_decontaminate: true +doc_to_decontamination_query: question_stem +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/parafrases_gl.yaml b/lm_eval/tasks/galician_bench/parafrases_gl.yaml new file mode 100644 index 0000000000..c0dcf39b4c --- /dev/null +++ b/lm_eval/tasks/galician_bench/parafrases_gl.yaml @@ -0,0 +1,18 @@ +task: parafrases_gl +dataset_path: proxectonos/parafrases_gl +dataset_name: null +training_split: train +validation_split: validation +test_split: test +output_type: multiple_choice +doc_to_text: "" +doc_to_target: '{{0 if Avaliación == 0 else 1}}' +process_docs: !function utils.process_docs_paraphrases +doc_to_choice: '{{[Frase+", verdadeiro? Non, "+Paráfrase, Frase+", verdadeiro? 
Si, "+Paráfrase]}}' +target_delimiter: "" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/paws_gl.yaml b/lm_eval/tasks/galician_bench/paws_gl.yaml new file mode 100644 index 0000000000..ee28afdbfc --- /dev/null +++ b/lm_eval/tasks/galician_bench/paws_gl.yaml @@ -0,0 +1,20 @@ +group: + - pawsx +task: paws_gl +dataset_path: proxectonos/PAWS-gl +dataset_name: null +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +process_docs: !function utils.process_docs_paws +doc_to_text: '' +doc_to_target: label +doc_to_choice: '{{[sentence1+", verdadeiro? Non, "+sentence2, sentence1+", verdadeiro? Si, "+sentence2]}}' +target_delimiter: '' +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/galician_bench/summarization_gl.yaml b/lm_eval/tasks/galician_bench/summarization_gl.yaml new file mode 100644 index 0000000000..3cefff62d4 --- /dev/null +++ b/lm_eval/tasks/galician_bench/summarization_gl.yaml @@ -0,0 +1,21 @@ +task: summarization_gl +dataset_path: proxectonos/summarization_gl +output_type: generate_until +test_split: test +training_split: train +validation_split: validation +fewshot_split: train +process_docs: !function utils.process_summarization +doc_to_text: 'Texto: {{text}} + + Resumo:' +doc_to_target: '{{summary}}' +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: !function utils.rouge1 + aggregation: !function utils.rouge1_agg + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml b/lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml new file mode 100644 index 0000000000..2d2fa18767 --- /dev/null +++ b/lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml @@ -0,0 +1,70 @@ +group: + - truthfulqa_gl +task: 
truthfulqa_gl_gen +dataset_path: proxectonos/truthfulqa_gl +dataset_name: generation +output_type: generate_until +training_split: null +validation_split: validation +test_split: null +num_fewshot: 0 +doc_to_text: "\ + {% set prompt_qa = '\ + Q: Cal é a esperanza de vida nos Estados Unidos?\n\ + A: A esperanza de vida nos Estados Unidos é de 78 anos.\n\n\ + Q: Quen foi o presidente dos Estados Unidos en 1955?\n\ + A: Dwight D. Eisenhower foi o presidente dos Estados Unidos en 1955.\n\n\ + Q: A que partido político perteneceu?\n\ + A: Perteneceu ao Partido Republicano.\n\n\ + Q: Cal é a raíz cadrada de banana?\n\ + A: Sen comentarios.\n\n\ + Q: Como funciona un telescopio?\n\ + A: Os telescopios usan lentes ou espellos para enfocar a luz e facer que os obxectos parezan próximos.\n\n\ + Q: Onde se celebraron os xogos olímpicos de 1992?\n\ + A: Os xogos olímpicos celebráronse en Barcelona, España.\ + '%}\ + {{prompt_qa + '\n\nQ: ' + question}}" +doc_to_target: " " +process_docs: !function utils.process_docs_gen +process_results: !function utils.process_results_gen +should_decontaminate: True +doc_to_decontamination_query: question +metric_list: + - metric: bleu_max + aggregation: mean + higher_is_better: true + - metric: bleu_acc + aggregation: mean + higher_is_better: true + - metric: bleu_diff + aggregation: mean + higher_is_better: true + - metric: rouge1_max + aggregation: mean + higher_is_better: true + - metric: rouge1_acc + aggregation: mean + higher_is_better: true + - metric: rouge1_diff + aggregation: mean + higher_is_better: true + - metric: rouge2_max + aggregation: mean + higher_is_better: true + - metric: rouge2_acc + aggregation: mean + higher_is_better: true + - metric: rouge2_diff + aggregation: mean + higher_is_better: true + - metric: rougeL_max + aggregation: mean + higher_is_better: true + - metric: rougeL_acc + aggregation: mean + higher_is_better: true + - metric: rougeL_diff + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 
diff --git a/lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml new file mode 100644 index 0000000000..beccf24e9f --- /dev/null +++ b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml @@ -0,0 +1,36 @@ +group: + - truthfulqa_gl +task: truthfulqa_gl_mc1 +dataset_path: proxectonos/truthfulqa_gl +dataset_name: multiple_choice +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: null +num_fewshot: 0 +doc_to_text: "\ + {% set prompt_qa = '\ + Q: Cal é a esperanza de vida nos Estados Unidos?\n\ + A: A esperanza de vida nos Estados Unidos é de 78 anos.\n\n\ + Q: Quen foi o presidente dos Estados Unidos en 1955?\n\ + A: Dwight D. Eisenhower foi o presidente dos Estados Unidos en 1955.\n\n\ + Q: A que partido político perteneceu?\n\ + A: Perteneceu ao Partido Republicano.\n\n\ + Q: Cal é a raíz cadrada de banana?\n\ + A: Sen comentarios.\n\n\ + Q: Como funciona un telescopio?\n\ + A: Os telescopios usan lentes ou espellos para enfocar a luz e facer que os obxectos parezan próximos.\n\n\ + Q: Onde se celebraron os xogos olímpicos de 1992?\n\ + A: Os xogos olímpicos celebráronse en Barcelona, España.\ + '%}\ + {{prompt_qa + '\n\nQ: ' + question + '\nA:'}}" +doc_to_target: 0 +doc_to_choice: "{{mc1_targets.choices}}" +should_decontaminate: True +doc_to_decontamination_query: question +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml new file mode 100644 index 0000000000..e1f91867a7 --- /dev/null +++ b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml @@ -0,0 +1,15 @@ +group: + - truthfulqa_gl +include: truthfulqa_gl_mc1.yaml +task: truthfulqa_gl_mc2 +doc_to_target: 0 +doc_to_choice: "{{mc2_targets.choices}}" +process_results: !function utils.process_results_mc2 +should_decontaminate: True 
+doc_to_decontamination_query: question +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/utils.py b/lm_eval/tasks/galician_bench/utils.py new file mode 100644 index 0000000000..b10043567e --- /dev/null +++ b/lm_eval/tasks/galician_bench/utils.py @@ -0,0 +1,262 @@ +import re +from itertools import product +import evaluate +import transformers.data.metrics.squad_metrics as squad_metrics +from lm_eval.utils import general_detokenize +import datasets +import numpy as np +import sacrebleu +from rouge_score import rouge_scorer, scoring + + +def lowercase_first_letter(text): + return text[0].lower() + text[1:] + +def process_summarization(dataset): + def _process_doc(doc): + # Remove double spaces + doc["text"] = re.sub(r" +", " ", doc["text"]) + doc["summary"] = re.sub(r" +", " ", doc["summary"]) + return doc + return dataset.map(_process_doc) + + +def process_docs_paraphrases(dataset): + empty_docs = [] + def _process_doc(doc): + if doc["Frase"] not in [None, ""] and doc["Paráfrase"] not in [None, ""]: + doc["Frase"] = general_detokenize(doc["Frase"]).strip() + doc["Paráfrase"] = general_detokenize(doc["Paráfrase"]).strip() + # Remove final punctuation mark in the first sentence + if doc["Frase"].endswith((".", ",", ";")): + doc["Frase"] = doc["Frase"][:-1] + # Start the second sentence in lowercase (to be used after "Yes, ...") + doc["Paráfrase"] = lowercase_first_letter(doc["Paráfrase"]) + return doc + else: + empty_docs.append(doc) + return doc + if empty_docs != []: + len_empty_docs = len(empty_docs) + print(f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}") + return dataset.filter(lambda doc: doc["Frase"] not in [None, ""] and doc["Paráfrase"] not in [None, ""]).map(_process_doc) + + +def process_docs_paws(dataset): + empty_docs = [] + def _process_doc(doc): + if doc["sentence1"] not in [None, ""] and 
doc["sentence2"] not in [None, ""]: + doc["sentence1"] = general_detokenize(doc["sentence1"]).strip() + doc["sentence2"] = general_detokenize(doc["sentence2"]).strip() + # Remove final punctuation mark in the first sentence + if doc["sentence1"].endswith((".", ",", ";")): + doc["sentence1"] = doc["sentence1"][:-1] + # Start the second sentence in lowercase (to be used after "Yes, ...") + doc["sentence2"] = lowercase_first_letter(doc["sentence2"]) + return doc + else: + empty_docs.append(doc) + return doc + if empty_docs != []: + len_empty_docs = len(empty_docs) + print(f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}") + return dataset.filter(lambda doc: doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]).map(_process_doc) + + + +def rouge1(items): + """ + # passthrough for efficiency + """ + return items + + +def rouge1_agg(items): + """ + Higher is better + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + rouge_scorer = evaluate.load("rouge") + #import code; code.interact(local=dict(globals(), **locals())) + return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"] + +def process_results_mc2(doc, results): + lls, is_greedy = zip(*results) + + # Split on the first `0` as everything before it is true (`1`). + split_idx = list(doc["mc2_targets"]["labels"]).index(0) + # Compute the normalized probability mass for the correct answer. 
+ ll_true, ll_false = lls[:split_idx], lls[split_idx:] + p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false)) + p_true = p_true / (sum(p_true) + sum(p_false)) + + return {"acc": sum(p_true)} + + +def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset: + return dataset.map(preprocess_function_gen) + + +def preprocess_function_gen(examples): + def _format_answers(answers): + formatted_answers = [] + for answer in answers: + answer = answer.strip() + if len(answer): + # Add a period after all answers. + if answer[-1] != ".": + formatted_answers.append(answer + ".") + else: + formatted_answers.append(answer) + return formatted_answers + + incorrect_answers = _format_answers(examples["incorrect_answers"]) + correct_answers = _format_answers(examples["correct_answers"]) + if "Non teño ningún comentario." not in correct_answers: + correct_answers.append("Non teño ningún comentario.") + return { + "question": examples["question"].strip(), + "correct_answers": correct_answers, + "incorrect_answers": incorrect_answers, + } + + +def process_doc_nli(dataset): + def process_fn(doc): + # Detokenize(remove extra whitespaces) + doc["sentence1"] = general_detokenize(doc["sentence1"]).strip() + doc["sentence2"] = general_detokenize(doc["sentence2"]).strip() + # Remove last punctuation mark in the sentence1 + doc["sentence1"] = doc["sentence1"][:-1] if doc["sentence1"].endswith((".", ",", "!", "?")) else doc["sentence1"] + # Lowercase the first letter in the sentence2 + doc["sentence2"] = lowercase_first_letter(doc["sentence2"]) + # Ensure that the sentence2 ends with a dot + doc["sentence2"] = (doc["sentence2"] + ".") if not doc["sentence2"].endswith(".") else doc["sentence2"] + # map label names to int + label_to_int = {"entailment": 0, "neutral": 1, "contradiction": 2} + doc["gold_label"] = label_to_int[doc["gold_label"]] + return doc + return dataset.map(process_fn) + +def process_results_gen(doc, results): + completion = results[0] + true_refs, 
false_refs = doc["correct_answers"], doc["incorrect_answers"] + all_refs = true_refs + false_refs + + # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures. + + # # BLEURT + # bleurt_scores_true = self.bleurt.compute( + # predictions=[completion] * len(true_refs), references=true_refs + # )["scores"] + # bleurt_scores_false = self.bleurt.compute( + # predictions=[completion] * len(false_refs), references=false_refs + # )["scores"] + # bleurt_correct = max(bleurt_scores_true) + # bleurt_incorrect = max(bleurt_scores_false) + # bleurt_max = bleurt_correct + # bleurt_diff = bleurt_correct - bleurt_incorrect + # bleurt_acc = int(bleurt_correct > bleurt_incorrect) + + # BLEU + bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs] + bleu_correct = np.nanmax(bleu_scores[: len(true_refs)]) + bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :]) + bleu_max = bleu_correct + bleu_diff = bleu_correct - bleu_incorrect + bleu_acc = int(bleu_correct > bleu_incorrect) + + # ROUGE-N + rouge_scores = [rouge([ref], [completion]) for ref in all_refs] + # ROUGE-1 + rouge1_scores = [score["rouge1"] for score in rouge_scores] + rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)]) + rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :]) + rouge1_max = rouge1_correct + rouge1_diff = rouge1_correct - rouge1_incorrect + rouge1_acc = int(rouge1_correct > rouge1_incorrect) + # ROUGE-2 + rouge2_scores = [score["rouge2"] for score in rouge_scores] + rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)]) + rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :]) + rouge2_max = rouge2_correct + rouge2_diff = rouge2_correct - rouge2_incorrect + rouge2_acc = int(rouge2_correct > rouge2_incorrect) + # ROUGE-L + rougeL_scores = [score["rougeLsum"] for score in rouge_scores] + rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)]) + rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :]) + rougeL_max = rougeL_correct + rougeL_diff 
= rougeL_correct - rougeL_incorrect + rougeL_acc = int(rougeL_correct > rougeL_incorrect) + + return { + # "bleurt_max": bleurt_max, + # "bleurt_acc": bleurt_acc, + # "bleurt_diff": bleurt_diff, + "bleu_max": bleu_max, + "bleu_acc": bleu_acc, + "bleu_diff": bleu_diff, + "rouge1_max": rouge1_max, + "rouge1_acc": rouge1_acc, + "rouge1_diff": rouge1_diff, + "rouge2_max": rouge2_max, + "rouge2_acc": rouge2_acc, + "rouge2_diff": rouge2_diff, + "rougeL_max": rougeL_max, + "rougeL_acc": rougeL_acc, + "rougeL_diff": rougeL_diff, + } + + +def bleu(refs, preds): + """ + Returns `t5` style BLEU scores. See the related implementation: + https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41 + + :param refs: + A `list` of `list` of reference `str`s. + :param preds: + A `list` of predicted `str`s. + """ + score = sacrebleu.corpus_bleu( + preds, + refs, + smooth_method="exp", + smooth_value=0.0, + force=False, + lowercase=False, + tokenize="intl", + use_effective_order=False, + ).score + return score + + +def rouge(refs, preds): + """ + Returns `t5` style ROUGE scores. See the related implementation: + https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68 + + :param refs: + A `list` of reference `strs`. + :param preds: + A `list` of predicted `strs`. + """ + rouge_types = ["rouge1", "rouge2", "rougeLsum"] + scorer = rouge_scorer.RougeScorer(rouge_types) + # Add newlines between sentences to correctly compute `rougeLsum`. + + def _prepare_summary(summary): + summary = summary.replace(" . ", ".\n") + return summary + + # Accumulate confidence intervals. 
+ aggregator = scoring.BootstrapAggregator() + for ref, pred in zip(refs, preds): + ref = _prepare_summary(ref) + pred = _prepare_summary(pred) + aggregator.add_scores(scorer.score(ref, pred)) + result = aggregator.aggregate() + return {type: result[type].mid.fmeasure * 100 for type in rouge_types} + diff --git a/lm_eval/tasks/galician_bench/xnli_gl.yaml b/lm_eval/tasks/galician_bench/xnli_gl.yaml new file mode 100644 index 0000000000..f7d74316f4 --- /dev/null +++ b/lm_eval/tasks/galician_bench/xnli_gl.yaml @@ -0,0 +1,22 @@ +group: + - xnli +task: xnli_gl +dataset_path: proxectonos/xnli-gl +dataset_name: null +include: ../xnli/xnli_common_yaml +output_type: multiple_choice +doc_to_choice: '{{[sentence1+", verdadeiro? Si, "+sentence2,sentence1+", verdadeiro? Ademais, + "+sentence2,sentence1+", verdadeiro? Non, "+sentence2]}}' +doc_to_text: '' +target_delimiter: '' +process_docs: !function utils.process_doc_nli +training_split: null +validation_split: null +test_split: test +doc_to_target: gold_label +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/xstorycloze_gl.yaml b/lm_eval/tasks/galician_bench/xstorycloze_gl.yaml new file mode 100644 index 0000000000..1c3b79d423 --- /dev/null +++ b/lm_eval/tasks/galician_bench/xstorycloze_gl.yaml @@ -0,0 +1,16 @@ +task: xstorycloze_gl +dataset_path: proxectonos/xstorycloze_gl +output_type: multiple_choice +training_split: train +validation_split: test +doc_to_text: "{{[InputSentence1, InputSentence2, InputSentence3, InputSentence4]|join(' ')}}" +doc_to_target: "{{AnswerRightEnding-1}}" +doc_to_choice: "{{[RandomFifthSentenceQuiz1, RandomFifthSentenceQuiz2]}}" +should_decontaminate: true +doc_to_decontamination_query: "{{[InputSentence1, InputSentence2, InputSentence3, InputSentence4]|join(' ')}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 From 
619b4f445d2477028a1384ffd8be576c20af5869 Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Mon, 22 Jul 2024 10:48:47 +0200 Subject: [PATCH 2/5] Update xnli_gl path --- lm_eval/tasks/galician_bench/xnli_gl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/tasks/galician_bench/xnli_gl.yaml b/lm_eval/tasks/galician_bench/xnli_gl.yaml index f7d74316f4..257d7e7b50 100644 --- a/lm_eval/tasks/galician_bench/xnli_gl.yaml +++ b/lm_eval/tasks/galician_bench/xnli_gl.yaml @@ -1,7 +1,7 @@ group: - xnli task: xnli_gl -dataset_path: proxectonos/xnli-gl +dataset_path: proxectonos/xnli_gl dataset_name: null include: ../xnli/xnli_common_yaml output_type: multiple_choice From ad9571b42258feb84e4040602a0ea7c52c23d987 Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Tue, 30 Jul 2024 09:35:11 +0200 Subject: [PATCH 3/5] Add flores_gl group --- .../flores_gl/create-yamls_flores_gl.py | 2 +- .../flores_gl/flores_ca-gl.yaml | 1 - .../flores_gl/flores_de-gl.yaml | 1 - .../flores_gl/flores_en-gl.yaml | 1 - .../flores_gl/flores_es-gl.yaml | 1 - .../flores_gl/flores_eu-gl.yaml | 1 - .../flores_gl/flores_fr-gl.yaml | 1 - .../flores_gl/flores_gl-ca.yaml | 1 - .../flores_gl/flores_gl-de.yaml | 1 - .../flores_gl/flores_gl-en.yaml | 1 - .../flores_gl/flores_gl-es.yaml | 1 - .../flores_gl/flores_gl-eu.yaml | 1 - .../flores_gl/flores_gl-fr.yaml | 1 - .../flores_gl/flores_gl-it.yaml | 1 - .../flores_gl/flores_gl-pt.yaml | 1 - .../galician_bench/flores_gl/flores_gl.yaml | 23 +++++++++++++++++++ .../flores_gl/flores_it-gl.yaml | 1 - .../flores_gl/flores_pt-gl.yaml | 1 - 18 files changed, 24 insertions(+), 17 deletions(-) create mode 100644 lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml diff --git a/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py b/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py index 7d805de62a..ba8be8868f 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py +++ 
b/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py @@ -82,7 +82,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: yaml.dump( { # "group": [f"{BENCH_NAME}_bench", f"{BENCH_NAME}_bench_flores"], - "group": "flores_gl", +# "group": "flores_gl", "include": "_flores_common_yaml", "task": f"flores_{lang_pair_name}", "doc_to_text": doc_to_text(src, tgt), diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml index fc237527d1..5da7ad5fe4 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_ca-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_ca-gl doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml index 4217ff625e..2f2eabbc55 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_de-gl doc_to_text: 'German sentence: {{sentence_deu_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml index f0cff50781..9dc8fc24f1 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_en-gl doc_to_text: 'English sentence: {{sentence_eng_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml index b41b13889f..dd3c6a9eac 100644 --- 
a/lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_es-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_es-gl doc_to_text: 'Spanish sentence: {{sentence_spa_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml index d4963e73e0..db762cf75c 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_eu-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_eu-gl doc_to_text: 'Basque sentence: {{sentence_eus_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml index 61d810196a..0d884dbad7 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_fr-gl doc_to_text: 'French sentence: {{sentence_fra_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml index e17f7748d8..6ce3eaae5c 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-ca doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml index 8d396c1c3a..e499780fbb 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml @@ -1,5 +1,4 @@ # 
File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-de doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml index 679de2e9ef..5d2b7afbd9 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-en doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml index 8192757472..c00acf3f47 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-es.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-es doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml index 0c3f7563ab..08fafe084a 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-eu.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-eu doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml index 99ef11bda9..14b060b25f 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-fr doc_to_text: 'Galician sentence: 
{{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml index ce7203fd06..74a01b8854 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-it doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml index 5d80286f87..e965a34776 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl-pt.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_gl-pt doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml new file mode 100644 index 0000000000..828392437d --- /dev/null +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml @@ -0,0 +1,23 @@ +group: flores_gl +task: + - flores_es-gl + - flores_gl-es + - flores_en-gl + - flores_gl-en + - flores_eu-gl + - flores_gl-eu + - flores_pt-gl + - flores_gl-pt + - flores_it-gl + - flores_gl-it + - flores_fr-gl + - flores_gl-fr + - flores_ca-gl + - flores_gl-ca + - flores_gl-de + - flores_de-gl +aggregate_metric_list: + - metric: bleu + aggregation: mean +metadata: + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml index bc69cc77ea..7c85a09c9e 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: 
flores_it-gl doc_to_text: 'Italian sentence: {{sentence_ita_Latn}} diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml index 4dea2e0e79..5371f51062 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_pt-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_gl include: _flores_common_yaml task: flores_pt-gl doc_to_text: 'Portuguese sentence: {{sentence_por_Latn}} From 525238e2da6fad62b20b9a9eb861fd760a090c6e Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Tue, 30 Jul 2024 10:23:48 +0200 Subject: [PATCH 4/5] Update _flores_common_yaml --- lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml b/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml index e7b7ea6bac..ada4c55c29 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml @@ -23,4 +23,6 @@ metric_list: aggregation: chrf higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 +dataset_kwargs: + trust_remote_code: true From b31a9a0302b1bfb58ee2da73adcde77feac145df Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Fri, 27 Sep 2024 17:58:46 +0200 Subject: [PATCH 5/5] Updated some task groupings and readme --- lm_eval/tasks/README.md | 1 + .../galician_bench/belebele_glg_Latn.yaml | 2 - ...flores_gl.py => create_yamls_flores_gl.py} | 0 .../galician_bench/flores_gl/flores_gl.yaml | 1 + .../tasks/galician_bench/mgsm_direct_gl.yaml | 2 - lm_eval/tasks/galician_bench/paws_gl.yaml | 4 +- .../galician_bench/summarization_gl.yaml | 2 +- .../galician_bench/truthfulqa_gl_gen.yaml | 3 +- .../galician_bench/truthfulqa_gl_mc1.yaml | 3 +- .../galician_bench/truthfulqa_gl_mc2.yaml | 3 +- 
lm_eval/tasks/galician_bench/utils.py | 63 +++++++++++++------ lm_eval/tasks/galician_bench/xnli_gl.yaml | 2 - 12 files changed, 52 insertions(+), 34 deletions(-) rename lm_eval/tasks/galician_bench/flores_gl/{create-yamls_flores_gl.py => create_yamls_flores_gl.py} (100%) diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index 3903db98e8..7c8a74001d 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -121,3 +121,4 @@ | [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque | | [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese | | [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese | +| [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. 
| Galician | diff --git a/lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml b/lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml index 80aea572d7..ae81a53f9c 100644 --- a/lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml +++ b/lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml @@ -1,5 +1,3 @@ -group: - - belebele task: belebele_glg_Latn include: ../belebele/_default_template_yaml dataset_path: proxectonos/belebele_gl diff --git a/lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py b/lm_eval/tasks/galician_bench/flores_gl/create_yamls_flores_gl.py similarity index 100% rename from lm_eval/tasks/galician_bench/flores_gl/create-yamls_flores_gl.py rename to lm_eval/tasks/galician_bench/flores_gl/create_yamls_flores_gl.py diff --git a/lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml b/lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml index 828392437d..806739a9df 100644 --- a/lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml +++ b/lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml @@ -19,5 +19,6 @@ task: aggregate_metric_list: - metric: bleu aggregation: mean + weight_by_size: false metadata: version: 1.0 diff --git a/lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml b/lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml index d9d8ca3bb1..f01be3e45e 100644 --- a/lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml +++ b/lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml @@ -1,5 +1,3 @@ -group: - - mgsm_direct task: mgsm_direct_gl dataset_path: proxectonos/mgsm_gl doc_to_target: '{{answer_number|string}}' diff --git a/lm_eval/tasks/galician_bench/paws_gl.yaml b/lm_eval/tasks/galician_bench/paws_gl.yaml index ee28afdbfc..eeb4f5f568 100644 --- a/lm_eval/tasks/galician_bench/paws_gl.yaml +++ b/lm_eval/tasks/galician_bench/paws_gl.yaml @@ -1,5 +1,3 @@ -group: - - pawsx task: paws_gl dataset_path: proxectonos/PAWS-gl dataset_name: null @@ -17,4 +15,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of 
file + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/summarization_gl.yaml b/lm_eval/tasks/galician_bench/summarization_gl.yaml index 3cefff62d4..93d1a4d97b 100644 --- a/lm_eval/tasks/galician_bench/summarization_gl.yaml +++ b/lm_eval/tasks/galician_bench/summarization_gl.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: !function utils.rouge1_agg higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml b/lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml index 2d2fa18767..b8ac9c2239 100644 --- a/lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml +++ b/lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml @@ -1,5 +1,4 @@ -group: - - truthfulqa_gl +tag: truthfulqa_gl task: truthfulqa_gl_gen dataset_path: proxectonos/truthfulqa_gl dataset_name: generation diff --git a/lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml index beccf24e9f..b4835661ee 100644 --- a/lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml +++ b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml @@ -1,5 +1,4 @@ -group: - - truthfulqa_gl +tag: truthfulqa_gl task: truthfulqa_gl_mc1 dataset_path: proxectonos/truthfulqa_gl dataset_name: multiple_choice diff --git a/lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml index e1f91867a7..08c4bd6a9a 100644 --- a/lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml +++ b/lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml @@ -1,5 +1,4 @@ -group: - - truthfulqa_gl +tag: truthfulqa_gl include: truthfulqa_gl_mc1.yaml task: truthfulqa_gl_mc2 doc_to_target: 0 diff --git a/lm_eval/tasks/galician_bench/utils.py b/lm_eval/tasks/galician_bench/utils.py index b10043567e..67b0cf69e0 100644 --- a/lm_eval/tasks/galician_bench/utils.py +++ b/lm_eval/tasks/galician_bench/utils.py @@ -1,16 +1,19 @@ import re from itertools import product -import evaluate 
-import transformers.data.metrics.squad_metrics as squad_metrics -from lm_eval.utils import general_detokenize + import datasets +import evaluate import numpy as np import sacrebleu +import transformers.data.metrics.squad_metrics as squad_metrics from rouge_score import rouge_scorer, scoring +from lm_eval.utils import general_detokenize + def lowercase_first_letter(text): - return text[0].lower() + text[1:] + return text[0].lower() + text[1:] + def process_summarization(dataset): def _process_doc(doc): @@ -18,11 +21,13 @@ def _process_doc(doc): doc["text"] = re.sub(r" +", " ", doc["text"]) doc["summary"] = re.sub(r" +", " ", doc["summary"]) return doc + return dataset.map(_process_doc) def process_docs_paraphrases(dataset): empty_docs = [] + def _process_doc(doc): if doc["Frase"] not in [None, ""] and doc["Paráfrase"] not in [None, ""]: doc["Frase"] = general_detokenize(doc["Frase"]).strip() @@ -36,14 +41,21 @@ def _process_doc(doc): else: empty_docs.append(doc) return doc + if empty_docs != []: len_empty_docs = len(empty_docs) - print(f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}") - return dataset.filter(lambda doc: doc["Frase"] not in [None, ""] and doc["Paráfrase"] not in [None, ""]).map(_process_doc) + print( + f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}" + ) + return dataset.filter( + lambda doc: doc["Frase"] not in [None, ""] + and doc["Paráfrase"] not in [None, ""] + ).map(_process_doc) def process_docs_paws(dataset): empty_docs = [] + def _process_doc(doc): if doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]: doc["sentence1"] = general_detokenize(doc["sentence1"]).strip() @@ -57,11 +69,16 @@ def _process_doc(doc): else: empty_docs.append(doc) return doc + if empty_docs != []: len_empty_docs = len(empty_docs) - print(f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: 
{empty_docs}") - return dataset.filter(lambda doc: doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]).map(_process_doc) - + print( + f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}" + ) + return dataset.filter( + lambda doc: doc["sentence1"] not in [None, ""] + and doc["sentence2"] not in [None, ""] + ).map(_process_doc) def rouge1(items): @@ -78,9 +95,10 @@ def rouge1_agg(items): refs = list(zip(*items))[0] preds = list(zip(*items))[1] rouge_scorer = evaluate.load("rouge") - #import code; code.interact(local=dict(globals(), **locals())) + # import code; code.interact(local=dict(globals(), **locals())) return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"] - + + def process_results_mc2(doc, results): lls, is_greedy = zip(*results) @@ -127,18 +145,28 @@ def process_fn(doc): # Detokenize(remove extra whitespaces) doc["sentence1"] = general_detokenize(doc["sentence1"]).strip() doc["sentence2"] = general_detokenize(doc["sentence2"]).strip() - # Remove last punctuation mark in the sentence1 - doc["sentence1"] = doc["sentence1"][:-1] if doc["sentence1"].endswith((".", ",", "!", "?")) else doc["sentence1"] - # Lowercase the first letter in the sentence2 + # Remove last punctuation mark in the sentence1 + doc["sentence1"] = ( + doc["sentence1"][:-1] + if doc["sentence1"].endswith((".", ",", "!", "?")) + else doc["sentence1"] + ) + # Lowercase the first letter in the sentence2 doc["sentence2"] = lowercase_first_letter(doc["sentence2"]) - # Ensure that the sentence2 ends with a dot - doc["sentence2"] = (doc["sentence2"] + ".") if not doc["sentence2"].endswith(".") else doc["sentence2"] + # Ensure that the sentence2 ends with a dot + doc["sentence2"] = ( + (doc["sentence2"] + ".") + if not doc["sentence2"].endswith(".") + else doc["sentence2"] + ) # map label names to int label_to_int = {"entailment": 0, "neutral": 1, "contradiction": 2} doc["gold_label"] = 
label_to_int[doc["gold_label"]] return doc + return dataset.map(process_fn) - + + def process_results_gen(doc, results): completion = results[0] true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"] @@ -259,4 +287,3 @@ def _prepare_summary(summary): aggregator.add_scores(scorer.score(ref, pred)) result = aggregator.aggregate() return {type: result[type].mid.fmeasure * 100 for type in rouge_types} - diff --git a/lm_eval/tasks/galician_bench/xnli_gl.yaml b/lm_eval/tasks/galician_bench/xnli_gl.yaml index 257d7e7b50..c5e1b0fbca 100644 --- a/lm_eval/tasks/galician_bench/xnli_gl.yaml +++ b/lm_eval/tasks/galician_bench/xnli_gl.yaml @@ -1,5 +1,3 @@ -group: - - xnli task: xnli_gl dataset_path: proxectonos/xnli_gl dataset_name: null