From 869998bca77bc3bb63351b739111904dc1d70c20 Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Fri, 19 Jul 2024 14:17:01 +0200 Subject: [PATCH 1/4] Add catalan_bench --- lm_eval/tasks/catalan_bench/README.md | 123 ++++++++++++++++++ .../tasks/catalan_bench/_arc_ca_common_yaml | 21 +++ .../tasks/catalan_bench/_cabreu_common_yaml | 17 +++ .../tasks/catalan_bench/arc_ca_challenge.yaml | 3 + lm_eval/tasks/catalan_bench/arc_ca_easy.yaml | 3 + .../catalan_bench/cabreu_abstractive.yaml | 8 ++ .../catalan_bench/cabreu_extractive.yaml | 8 ++ .../tasks/catalan_bench/cabreu_extreme.yaml | 8 ++ .../tasks/catalan_bench/catalan_bench.yaml | 25 ++++ lm_eval/tasks/catalan_bench/catalanqa.yaml | 25 ++++ lm_eval/tasks/catalan_bench/catcola.yaml | 14 ++ lm_eval/tasks/catalan_bench/copa_ca.yaml | 17 +++ lm_eval/tasks/catalan_bench/coqcat.yaml | 23 ++++ .../flores_ca/_flores_common_yaml | 23 ++++ .../flores_ca/create-yamls_flores_ca.py | 115 ++++++++++++++++ .../catalan_bench/flores_ca/flores_ca-de.yaml | 8 ++ .../catalan_bench/flores_ca/flores_ca-en.yaml | 8 ++ .../catalan_bench/flores_ca/flores_ca-es.yaml | 8 ++ .../catalan_bench/flores_ca/flores_ca-eu.yaml | 8 ++ .../catalan_bench/flores_ca/flores_ca-fr.yaml | 8 ++ .../catalan_bench/flores_ca/flores_ca-gl.yaml | 8 ++ .../catalan_bench/flores_ca/flores_ca-it.yaml | 8 ++ .../catalan_bench/flores_ca/flores_ca-pt.yaml | 8 ++ .../catalan_bench/flores_ca/flores_de-ca.yaml | 8 ++ .../catalan_bench/flores_ca/flores_en-ca.yaml | 8 ++ .../catalan_bench/flores_ca/flores_es-ca.yaml | 8 ++ .../catalan_bench/flores_ca/flores_eu-ca.yaml | 8 ++ .../catalan_bench/flores_ca/flores_fr-ca.yaml | 8 ++ .../catalan_bench/flores_ca/flores_gl-ca.yaml | 8 ++ .../catalan_bench/flores_ca/flores_it-ca.yaml | 8 ++ .../catalan_bench/flores_ca/flores_pt-ca.yaml | 8 ++ .../tasks/catalan_bench/mgsm_direct_ca.yaml | 27 ++++ .../tasks/catalan_bench/openbookqa_ca.yaml | 20 +++ lm_eval/tasks/catalan_bench/parafraseja.yaml | 17 +++ lm_eval/tasks/catalan_bench/paws_ca.yaml | 
20 +++ .../phrases_va/_phrases_va_common.yaml | 24 ++++ .../phrases_va/phrases_ca-va.yaml | 7 + .../phrases_va/phrases_va-ca.yaml | 7 + lm_eval/tasks/catalan_bench/piqa_ca.yaml | 21 +++ lm_eval/tasks/catalan_bench/siqa_ca.yaml | 16 +++ lm_eval/tasks/catalan_bench/teca.yaml | 18 +++ lm_eval/tasks/catalan_bench/utils.py | 122 +++++++++++++++++ lm_eval/tasks/catalan_bench/wnli_ca.yaml | 14 ++ lm_eval/tasks/catalan_bench/xnli_ca.yaml | 21 +++ lm_eval/tasks/catalan_bench/xquad_ca.yaml | 25 ++++ .../tasks/catalan_bench/xstorycloze_ca.yaml | 17 +++ 46 files changed, 937 insertions(+) create mode 100644 lm_eval/tasks/catalan_bench/README.md create mode 100644 lm_eval/tasks/catalan_bench/_arc_ca_common_yaml create mode 100644 lm_eval/tasks/catalan_bench/_cabreu_common_yaml create mode 100644 lm_eval/tasks/catalan_bench/arc_ca_challenge.yaml create mode 100644 lm_eval/tasks/catalan_bench/arc_ca_easy.yaml create mode 100644 lm_eval/tasks/catalan_bench/cabreu_abstractive.yaml create mode 100644 lm_eval/tasks/catalan_bench/cabreu_extractive.yaml create mode 100644 lm_eval/tasks/catalan_bench/cabreu_extreme.yaml create mode 100644 lm_eval/tasks/catalan_bench/catalan_bench.yaml create mode 100644 lm_eval/tasks/catalan_bench/catalanqa.yaml create mode 100644 lm_eval/tasks/catalan_bench/catcola.yaml create mode 100644 lm_eval/tasks/catalan_bench/copa_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/coqcat.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml create mode 100644 
lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/openbookqa_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/parafraseja.yaml create mode 100644 lm_eval/tasks/catalan_bench/paws_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml create mode 100644 lm_eval/tasks/catalan_bench/phrases_va/phrases_ca-va.yaml create mode 100644 lm_eval/tasks/catalan_bench/phrases_va/phrases_va-ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/piqa_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/siqa_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/teca.yaml create mode 100644 lm_eval/tasks/catalan_bench/utils.py create mode 100644 lm_eval/tasks/catalan_bench/wnli_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/xnli_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/xquad_ca.yaml create mode 100644 lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml diff --git a/lm_eval/tasks/catalan_bench/README.md b/lm_eval/tasks/catalan_bench/README.md new file mode 100644 index 0000000000..32b1b0fc34 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/README.md @@ -0,0 +1,123 @@ +# 
CatalanBench + +### Paper + +CatalanBench is a benchmark for evaluating language models in Catalan tasks. That is, it evaluates the ability of a language model to understand and generate Catalan text. CatalanBench offers a combination of pre-existing, open datasets and datasets developed exclusively for this benchmark. All the details of CatalanBench will be published in a paper soon. + +The new evaluation datasets included in CatalanBench are: +| Task | Category | Homepage | +|:-------------:|:-----:|:-----:| +| ARC_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/arc_ca | +| MGSM_ca | Math | https://huggingface.co/datasets/projecte-aina/mgsm_ca | +| OpenBookQA_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/openbookqa_ca | +| Parafraseja | Paraphrasing | https://huggingface.co/datasets/projecte-aina/Parafraseja | +| PIQA_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/piqa_ca | +| SIQA_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/siqa_ca | +| XStoryCloze_ca | Commonsense Reasoning | https://huggingface.co/datasets/projecte-aina/xstorycloze_ca | + +The datasets included in CatalanBench that have been made public in previous publications are: + +| Task | Category | Paper title | Homepage | +|:-------------:|:-----:|:-------------:|:-----:| +| Belebele_ca | Reading Comprehension | [The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants](https://arxiv.org/abs/2308.16884) | https://huggingface.co/datasets/facebook/belebele | +| caBREU | Summarization | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/caBreu | +| CatalanQA | Question Answering | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | 
https://huggingface.co/datasets/projecte-aina/catalanqa | +| CatCoLA | Linguistic Acceptability | CatCoLA: Catalan Corpus of Linguistic Acceptability | https://huggingface.co/datasets/nbel/CatCoLA | +| COPA-ca | Commonsense Reasoning | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/COPA-ca | +| CoQCat | Question Answering | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/CoQCat | +| FLORES_ca | Translation | [The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation](https://arxiv.org/abs/2106.03193) | https://huggingface.co/datasets/facebook/flores | +| PAWS-ca | Paraphrasing | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/PAWS-ca | +| TE-ca | Natural Language Inference | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/teca | +| VeritasQA_ca | Truthfulness | VeritasQA: A Truthfulness Benchmark Aimed at Multilingual Transferability | TBA | +| WNLI-ca | Natural Language Inference | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/wnli-ca | +| XNLI-ca | Natural Language Inference | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/xnli-ca | +| XQuAD-ca | Question Answering | [Building a Data Infrastructure for a Mid-Resource Language: The Case of 
Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/xquad-ca | + + +### Citation +Paper for CatalanBench coming soon. + + + +### Groups and Tasks + +#### Groups + +- `catalan_bench`: All tasks included in CatalanBench. +- `cabreu`: Three CaBREU tasks for each type of summary (extractive, abstractive and extreme). +- `flores_ca`: All FLORES translation tasks from or to Catalan. +- `phrases_ca`: Two Phrases_va tasks for language adaptation between Catalan and Valencian. + + +#### Tasks + +The following tasks evaluate tasks on CatalanBench dataset using various scoring methods. + - `arc_ca_challenge` + - `arc_ca_easy` + - `belebele_cat_Latn` + - `cabreu` + - `catalanqa` + - `catcola` + - `copa_ca` + - `coqcat` + - `flores_ca` + - `flores_ca-de` + - `flores_ca-en` + - `flores_ca-es` + - `flores_ca-eu` + - `flores_ca-fr` + - `flores_ca-gl` + - `flores_ca-it` + - `flores_ca-pt` + - `flores_de-ca` + - `flores_en-ca` + - `flores_es-ca` + - `flores_eu-ca` + - `flores_fr-ca` + - `flores_gl-ca` + - `flores_it-ca` + - `flores_pt-ca` + - `mgsm_direct_ca` + - `openbookqa_ca` + - `parafraseja` + - `paws_ca` + - `phrases_ca` + - `piqa_ca` + - `siqa_ca` + - `teca` + - `veritasqa_gen_ca` + - `veritasqa_mc1_ca` + - `veritasqa_mc2_ca` + - `wnli_ca` + - `xnli_ca` + - `xquad_ca` + - `xstorycloze_ca` + +Some of these tasks are taken from benchmarks already available in LM Evaluation Harness. These are: +- `belebele_cat_Latn`: Belebele Catalan +- `veritasqa_gen_ca`: VeritasQA Catalan +- `veritasqa_mc1_ca`: VeritasQA Catalan +- `veritasqa_mc2_ca`: VeritasQA Catalan + + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? 
+ * [ ] Yes, original implementation contributed by author of the benchmark + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/lm_eval/tasks/catalan_bench/_arc_ca_common_yaml b/lm_eval/tasks/catalan_bench/_arc_ca_common_yaml new file mode 100644 index 0000000000..38dfc08b46 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/_arc_ca_common_yaml @@ -0,0 +1,21 @@ +group: + - ai2_arc +dataset_path: projecte-aina/arc_ca +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +doc_to_text: "Pregunta: {{question}}\nResposta:" +doc_to_target: "{{choices.label.index(answerKey)}}" +doc_to_choice: "{{choices.text}}" +should_decontaminate: true +doc_to_decontamination_query: "Pregunta: {{question}}\nResposta:" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/_cabreu_common_yaml b/lm_eval/tasks/catalan_bench/_cabreu_common_yaml new file mode 100644 index 0000000000..85c51a18b6 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/_cabreu_common_yaml @@ -0,0 +1,17 @@ +group: cabreu +dataset_path: projecte-aina/caBreu +dataset_name: null +output_type: generate_until +test_split: test +training_split: train +validation_split: validation +process_docs: !function utils.process_doc_cabreu +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: !function utils.rouge1 + aggregation: !function utils.rouge1_agg + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/arc_ca_challenge.yaml b/lm_eval/tasks/catalan_bench/arc_ca_challenge.yaml new file mode 100644 index 
0000000000..9d7a9c8423 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/arc_ca_challenge.yaml @@ -0,0 +1,3 @@ +task: arc_ca_challenge +dataset_name: ARC-Challenge +include: _arc_ca_common_yaml diff --git a/lm_eval/tasks/catalan_bench/arc_ca_easy.yaml b/lm_eval/tasks/catalan_bench/arc_ca_easy.yaml new file mode 100644 index 0000000000..67b28fd626 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/arc_ca_easy.yaml @@ -0,0 +1,3 @@ +task: arc_ca_easy +dataset_name: ARC-Easy +include: _arc_ca_common_yaml diff --git a/lm_eval/tasks/catalan_bench/cabreu_abstractive.yaml b/lm_eval/tasks/catalan_bench/cabreu_abstractive.yaml new file mode 100644 index 0000000000..930ba28a52 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/cabreu_abstractive.yaml @@ -0,0 +1,8 @@ +include: _cabreu_common_yaml +task: cabreu_abstractive +description: "Examina el text següent i genera'n un resum abstractiu, expressant el significat del text original d'una manera més natural i concisa.\n" +doc_to_text: >- + Text: {{content}} + + Resum: +doc_to_target: '{{summaries["abstractive"]["a1"]}}' diff --git a/lm_eval/tasks/catalan_bench/cabreu_extractive.yaml b/lm_eval/tasks/catalan_bench/cabreu_extractive.yaml new file mode 100644 index 0000000000..e5f3dd4dd0 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/cabreu_extractive.yaml @@ -0,0 +1,8 @@ +include: _cabreu_common_yaml +task: cabreu_extractive +description: "Examina el text següent i genera'n un resum extractiu, utilitzant les frases o oracions més rellevants del text original.\n" +doc_to_text: >- + Text: {{content}} + + Resum: +doc_to_target: '{{summaries["extractive"]["a1"]}}' diff --git a/lm_eval/tasks/catalan_bench/cabreu_extreme.yaml b/lm_eval/tasks/catalan_bench/cabreu_extreme.yaml new file mode 100644 index 0000000000..98efbe9cd4 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/cabreu_extreme.yaml @@ -0,0 +1,8 @@ +include: _cabreu_common_yaml +task: cabreu_extreme +description: "Examina el text següent i genera'n un resum que sigui el més concís 
possible i que preservi el significat del text original.\n" +doc_to_text: >- + Text: {{content}} + + Resum: +doc_to_target: '{{summaries["extreme"]["a1"]}}' diff --git a/lm_eval/tasks/catalan_bench/catalan_bench.yaml b/lm_eval/tasks/catalan_bench/catalan_bench.yaml new file mode 100644 index 0000000000..1f1f09ece2 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/catalan_bench.yaml @@ -0,0 +1,25 @@ +group: catalan_bench +task: + - belebele_cat_Latn + - xnli_ca + - catcola + - copa_ca + - openbookqa_ca + - parafraseja + - paws_ca + - piqa_ca + - siqa_ca + - teca + - wnli_ca + - arc_ca_easy + - arc_ca_challenge + - xstorycloze_ca + - xquad_ca + - catalanqa + - coqcat + - flores_ca + - cabreu + - mgsm_direct_ca + - phrases_va +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/catalanqa.yaml b/lm_eval/tasks/catalan_bench/catalanqa.yaml new file mode 100644 index 0000000000..8861794e94 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/catalanqa.yaml @@ -0,0 +1,25 @@ +task: catalanqa +dataset_path: projecte-aina/catalanqa +dataset_name: null +output_type: generate_until +training_split: train +validation_split: validation +test_split: test +doc_to_text: "Context: {{context}}\n\nPregunta: {{question}}\n\nResposta:" +doc_to_target: '{{answers[0]["text"]}}' +target_delimiter: ' ' +process_results: !function utils.process_results_qa +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + - metric: f1 + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/catcola.yaml b/lm_eval/tasks/catalan_bench/catcola.yaml new file mode 100644 index 0000000000..121b5e7f48 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/catcola.yaml @@ -0,0 +1,14 @@ +task: catcola +dataset_path: nbel/CatCoLA +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: null 
+doc_to_text: "{{Sentence}}\nPregunta: Té sentit aquesta frase?\nResposta:" +doc_to_target: label +doc_to_choice: ["no", "sí"] +metric_list: + - metric: mcc + - metric: acc +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/copa_ca.yaml b/lm_eval/tasks/catalan_bench/copa_ca.yaml new file mode 100644 index 0000000000..c7ca3f11a1 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/copa_ca.yaml @@ -0,0 +1,17 @@ +task: copa_ca +dataset_path: projecte-aina/COPA-ca +dataset_name: null +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +process_docs: !function utils.process_docs_copa_ca +doc_to_text: '{{premise[:-1].strip() + " " + {"cause": "perquè", "effect": "i per tant"}[question]}}' +doc_to_target: '{{choice1 if label == 0 else choice2}}' +doc_to_choice: '{{[choice1, choice2]}}' +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/coqcat.yaml b/lm_eval/tasks/catalan_bench/coqcat.yaml new file mode 100644 index 0000000000..12ab7d7cdf --- /dev/null +++ b/lm_eval/tasks/catalan_bench/coqcat.yaml @@ -0,0 +1,23 @@ +task: coqcat +dataset_path: projecte-aina/CoQCat +output_type: generate_until +training_split: train +validation_split: validation +test_split: test +doc_to_text: '{{story+"\n\n"}}{% for i in range(questions|length-1) %}{{"Q: "+questions[i]+"\n\n"+"A: "+answers["input_text"][i]+"\n\n"}}{% endfor %}{{"Q: "+questions[-1]+"\n\n"+"A:"}}' +doc_to_target: '{{ answers["input_text"][questions|length - 1] }}' +process_results: !function utils.process_results_coqcat +should_decontaminate: true +doc_to_decontamination_query: "{{story}} {{question.input_text|join('\n')}}" +generation_kwargs: + until: + - "\nQ:" +metric_list: + - metric: "em" + aggregation: mean + higher_is_better: true + - metric: "f1" + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of 
file diff --git a/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml b/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml new file mode 100644 index 0000000000..075a66b5f4 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml @@ -0,0 +1,23 @@ +dataset_path: facebook/flores +dataset_name: all +output_type: generate_until +training_split: dev +validation_split: dev +test_split: devtest +fewshot_split: dev +target_delimiter: '' +generation_kwargs: + until: + - "\n" +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: ter + aggregation: ter + higher_is_better: false + - metric: chrf + aggregation: chrf + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py b/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py new file mode 100644 index 0000000000..eeadc3ed76 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py @@ -0,0 +1,115 @@ +""" +Script to generate task YAMLs for the FLORES-200 dataset. +Based on `tasks/translation/utils.py`. 
+""" + +import argparse +import yaml +from langcodes import * +from itertools import * + +# utils +flatten = lambda l: list(itertools.chain(*l)) + +# constants +_LANGUAGES = [ +"ace_Arab", "bam_Latn", "dzo_Tibt", "hin_Deva", "khm_Khmr", "mag_Deva", "pap_Latn", "sot_Latn", "tur_Latn", +"ace_Latn", "ban_Latn", "ell_Grek", "hne_Deva", "kik_Latn", "mai_Deva", "pbt_Arab", "spa_Latn", "twi_Latn", +"acm_Arab", "bel_Cyrl", "eng_Latn", "hrv_Latn", "kin_Latn", "mal_Mlym", "pes_Arab", "srd_Latn", "tzm_Tfng", +"acq_Arab", "bem_Latn", "epo_Latn", "hun_Latn", "kir_Cyrl", "mar_Deva", "plt_Latn", "srp_Cyrl", "uig_Arab", +"aeb_Arab", "ben_Beng", "est_Latn", "hye_Armn", "kmb_Latn", "min_Arab", "pol_Latn", "ssw_Latn", "ukr_Cyrl", +"afr_Latn", "bho_Deva", "eus_Latn", "ibo_Latn", "kmr_Latn", "min_Latn", "por_Latn", "sun_Latn", "umb_Latn", +"ajp_Arab", "bjn_Arab", "ewe_Latn", "ilo_Latn", "knc_Arab", "mkd_Cyrl", "prs_Arab", "swe_Latn", "urd_Arab", +"aka_Latn", "bjn_Latn", "fao_Latn", "ind_Latn", "knc_Latn", "mlt_Latn", "quy_Latn", "swh_Latn", "uzn_Latn", +"als_Latn", "bod_Tibt", "fij_Latn", "isl_Latn", "kon_Latn", "mni_Beng", "ron_Latn", "szl_Latn", "vec_Latn", +"amh_Ethi", "bos_Latn", "fin_Latn", "ita_Latn", "kor_Hang", "mos_Latn", "run_Latn", "tam_Taml", "vie_Latn", +"apc_Arab", "bug_Latn", "fon_Latn", "jav_Latn", "lao_Laoo", "mri_Latn", "rus_Cyrl", "taq_Latn", "war_Latn", +"arb_Arab", "bul_Cyrl", "fra_Latn", "jpn_Jpan", "lij_Latn", "mya_Mymr", "sag_Latn", "taq_Tfng", "wol_Latn", +"arb_Latn", "cat_Latn", "fur_Latn", "kab_Latn", "lim_Latn", "nld_Latn", "san_Deva", "tat_Cyrl", "xho_Latn", +"ars_Arab", "ceb_Latn", "fuv_Latn", "kac_Latn", "lin_Latn", "nno_Latn", "sat_Olck", "tel_Telu", "ydd_Hebr", +"ary_Arab", "ces_Latn", "gaz_Latn", "kam_Latn", "lit_Latn", "nob_Latn", "scn_Latn", "tgk_Cyrl", "yor_Latn", +"arz_Arab", "cjk_Latn", "gla_Latn", "kan_Knda", "lmo_Latn", "npi_Deva", "shn_Mymr", "tgl_Latn", "yue_Hant", +"asm_Beng", "ckb_Arab", "gle_Latn", "kas_Arab", "ltg_Latn", "nso_Latn", 
"sin_Sinh", "tha_Thai", "zho_Hans", +"ast_Latn", "crh_Latn", "glg_Latn", "kas_Deva", "ltz_Latn", "nus_Latn", "slk_Latn", "tir_Ethi", "zho_Hant", +"awa_Deva", "cym_Latn", "grn_Latn", "kat_Geor", "lua_Latn", "nya_Latn", "slv_Latn", "tpi_Latn", "zsm_Latn", +"ayr_Latn", "dan_Latn", "guj_Gujr", "kaz_Cyrl", "lug_Latn", "oci_Latn", "smo_Latn", "tsn_Latn", "zul_Latn", +"azb_Arab", "deu_Latn", "hat_Latn", "kbp_Latn", "luo_Latn", "ory_Orya", "sna_Latn", "tso_Latn", +"azj_Latn", "dik_Latn", "hau_Latn", "kea_Latn", "lus_Latn", "pag_Latn", "snd_Arab", "tuk_Latn", +"bak_Cyrl", "dyu_Latn", "heb_Hebr", "khk_Cyrl", "lvs_Latn", "pan_Guru", "som_Latn", "tum_Latn" +] +LANGUAGE_PAIRS = [(a, b) for idx, a in enumerate(_LANGUAGES) for b in _LANGUAGES[idx + 1:]] + +LANGUAGES_OF_INTEREST = ["cat_Latn", "spa_Latn", "eng_Latn", "glg_Latn", "eus_Latn", "ita_Latn", "deu_Latn", "por_Latn", "fra_Latn"] +MAIN_LANG = "cat_Latn" +LANGUAGE_PAIRS = [(a, b) for (a, b) in LANGUAGE_PAIRS if a in LANGUAGES_OF_INTEREST and b in LANGUAGES_OF_INTEREST and "cat_Latn" in (a, b)] + +# auxiliary functions + +code_to_language_name = lambda code: Language.make(language=Language.get(code)["language"]).display_name() +code_to_short_name = lambda code: Language.get(code)["language"] +jinja_var = lambda s: "{{" + s + "}}" # wrapper to avoid having to escape { } in format strings + +def doc_to_text(src: str, tgt: str) -> str: + src_name, tgt_name = map(code_to_language_name, [src, tgt]) + + return f"""\ +{src_name} sentence: {jinja_var('sentence_' + src)} +{tgt_name} sentence:""" + +def doc_to_target(tgt: str) -> str: + + return f"{jinja_var('sentence_' + tgt)}" + +# main function + +def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: + """ + Generate a YAML file for each translation direction. 
+ """ + + err = [] + for src, tgt in LANGUAGE_PAIRS: + + # do both translation directions for each lang pair + for src, tgt in [(src, tgt), (tgt, src)]: + lang_pair_name = f"{code_to_short_name(src)}-{code_to_short_name(tgt)}" + yaml_file_name = f"flores_{lang_pair_name}.yaml" + + try: + with open( f"{output_dir}/{yaml_file_name}", "w" if overwrite else "x", encoding="utf-8") as outfile: + print(f"Creating {yaml_file_name}...") + outfile.write("# File generated by `create-yamls.py`\n") + yaml.dump( + { +# "group": [f"{BENCH_NAME}_bench", f"{BENCH_NAME}_bench_flores"], + "group": "flores_ca", + "include": "_flores_common_yaml", + "task": f"flores_{lang_pair_name}", + "doc_to_text": doc_to_text(src, tgt), + "doc_to_target": doc_to_target(tgt), + }, + outfile, + sort_keys=False, + ) + + except FileExistsError: + err.append(yaml_file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist:" + f" {', '.join(err)}" + "\nUse flag --overwrite to overwrite them." 
+ ) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--overwrite", default=False, action="store_true", help="Overwrite files if they already exist") + parser.add_argument( "--output-dir", default=".", help="Directory to write yaml files to" ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite) + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml new file mode 100644 index 0000000000..0bb415a04b --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-de +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + German sentence:' +doc_to_target: '{{sentence_deu_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml new file mode 100644 index 0000000000..3aa1351307 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-en +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + English sentence:' +doc_to_target: '{{sentence_eng_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml new file mode 100644 index 0000000000..1b0cf7c7f0 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-es +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + Spanish sentence:' +doc_to_target: '{{sentence_spa_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml 
b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml new file mode 100644 index 0000000000..fd540c6425 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-eu +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + Basque sentence:' +doc_to_target: '{{sentence_eus_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml new file mode 100644 index 0000000000..5aa495781d --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-fr +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + French sentence:' +doc_to_target: '{{sentence_fra_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml new file mode 100644 index 0000000000..d33cdd505e --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-gl +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + Galician sentence:' +doc_to_target: '{{sentence_glg_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml new file mode 100644 index 0000000000..61431e8da2 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-it +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + Italian sentence:' +doc_to_target: '{{sentence_ita_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml 
b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml new file mode 100644 index 0000000000..3b0e70b550 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_ca-pt +doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} + + Portuguese sentence:' +doc_to_target: '{{sentence_por_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml new file mode 100644 index 0000000000..363bd62ced --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_de-ca +doc_to_text: 'German sentence: {{sentence_deu_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml new file mode 100644 index 0000000000..81706f6dc2 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_en-ca +doc_to_text: 'English sentence: {{sentence_eng_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml new file mode 100644 index 0000000000..a74437e392 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_es-ca +doc_to_text: 'Spanish sentence: {{sentence_spa_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml 
b/lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml new file mode 100644 index 0000000000..dcf6e3760f --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_eu-ca +doc_to_text: 'Basque sentence: {{sentence_eus_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml new file mode 100644 index 0000000000..a5abc7ead8 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_fr-ca +doc_to_text: 'French sentence: {{sentence_fra_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml new file mode 100644 index 0000000000..78c554086f --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_gl-ca +doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml new file mode 100644 index 0000000000..128834d976 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_it-ca +doc_to_text: 'Italian sentence: {{sentence_ita_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml 
b/lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml new file mode 100644 index 0000000000..dd355b797c --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml @@ -0,0 +1,8 @@ +# File generated by `create-yamls.py` +group: flores_ca +include: _flores_common_yaml +task: flores_pt-ca +doc_to_text: 'Portuguese sentence: {{sentence_por_Latn}} + + Catalan sentence:' +doc_to_target: '{{sentence_cat_Latn}}' diff --git a/lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml b/lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml new file mode 100644 index 0000000000..754d5b91ce --- /dev/null +++ b/lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml @@ -0,0 +1,27 @@ +group: + - mgsm_direct +task: mgsm_direct_ca +dataset_path: projecte-aina/mgsm_ca +doc_to_target: '{{answer_number|string}}' +doc_to_text: '{% if answer != None %}{{question + "\nResposta: "}}{% else %}{{"Pregunta: " + question + "\nResposta: "}}{% endif %}' +output_type: generate_until +training_split: train +test_split: test +target_delimiter: "" +generation_kwargs: + until: + - "\n\n" + - "\n" +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/openbookqa_ca.yaml b/lm_eval/tasks/catalan_bench/openbookqa_ca.yaml new file mode 100644 index 0000000000..00a1f03d4f --- /dev/null +++ b/lm_eval/tasks/catalan_bench/openbookqa_ca.yaml @@ -0,0 +1,20 @@ +task: openbookqa_ca +dataset_path: projecte-aina/openbookqa_ca +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +doc_to_text: question_stem +doc_to_target: "{{choices.label.index(answerKey.lstrip())}}" +doc_to_choice: "{{choices.text}}" +should_decontaminate: true +doc_to_decontamination_query: question_stem +metric_list: + - metric: acc + aggregation: mean + 
higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/parafraseja.yaml b/lm_eval/tasks/catalan_bench/parafraseja.yaml new file mode 100644 index 0000000000..208e3e373f --- /dev/null +++ b/lm_eval/tasks/catalan_bench/parafraseja.yaml @@ -0,0 +1,17 @@ +task: parafraseja +dataset_path: projecte-aina/Parafraseja +output_type: multiple_choice +dataset_name: null +test_split: test +training_split: train +validation_split: validation +doc_to_choice: '{{[sentence1+", veritat? No, "+sentence2, sentence1+", veritat? Sí, "+sentence2]}}' +process_docs: !function utils.process_docs_paraphrases +doc_to_text: '' +doc_to_target: label +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/paws_ca.yaml b/lm_eval/tasks/catalan_bench/paws_ca.yaml new file mode 100644 index 0000000000..c9fbd04f9a --- /dev/null +++ b/lm_eval/tasks/catalan_bench/paws_ca.yaml @@ -0,0 +1,20 @@ +group: + - pawsx +task: paws_ca +dataset_path: projecte-aina/PAWS-ca +dataset_name: null +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +process_docs: !function utils.process_docs_paraphrases +doc_to_text: '' +doc_to_target: label +doc_to_choice: '{{[sentence1+", veritat? No, "+sentence2, sentence1+", veritat? 
Sí, "+sentence2]}}' +target_delimiter: '' +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml b/lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml new file mode 100644 index 0000000000..48232725b2 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml @@ -0,0 +1,24 @@ +group: phrases_va +dataset_path: gplsi/CA-VA_alignment_test +output_type: generate_until +training_split: null +validation_split: null +test_split: test +fewshot_split: test +num_fewshot: 5 +target_delimiter: ' ' +generation_kwargs: + until: + - "\n" +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: ter + aggregation: ter + higher_is_better: false + - metric: chrf + aggregation: chrf + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/phrases_va/phrases_ca-va.yaml b/lm_eval/tasks/catalan_bench/phrases_va/phrases_ca-va.yaml new file mode 100644 index 0000000000..fc0e08d5a2 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/phrases_va/phrases_ca-va.yaml @@ -0,0 +1,7 @@ +# File generated by `create-yamls.py` +include: _phrases_va_common.yaml +task: phrases_ca-va +doc_to_text: 'Oració en català: {{ca}} + + Oració en valencià:' +doc_to_target: '{{va}}' diff --git a/lm_eval/tasks/catalan_bench/phrases_va/phrases_va-ca.yaml b/lm_eval/tasks/catalan_bench/phrases_va/phrases_va-ca.yaml new file mode 100644 index 0000000000..5b1a76780a --- /dev/null +++ b/lm_eval/tasks/catalan_bench/phrases_va/phrases_va-ca.yaml @@ -0,0 +1,7 @@ +# File generated by `create-yamls.py` +include: _phrases_va_common.yaml +task: phrases_va-ca +doc_to_text: 'Oració en valencià: {{va}} + + Oració en català:' +doc_to_target: '{{ca}}' diff --git a/lm_eval/tasks/catalan_bench/piqa_ca.yaml b/lm_eval/tasks/catalan_bench/piqa_ca.yaml new file mode 100644 index 
0000000000..11e600a7f1 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/piqa_ca.yaml @@ -0,0 +1,21 @@ +task: piqa_ca +dataset_path: projecte-aina/piqa_ca +dataset_name: null +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: null +doc_to_text: "Pregunta: {{goal}}\nResposta:" +doc_to_target: label +doc_to_choice: "{{[sol1, sol2]}}" +should_decontaminate: true +doc_to_decontamination_query: goal +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/siqa_ca.yaml b/lm_eval/tasks/catalan_bench/siqa_ca.yaml new file mode 100644 index 0000000000..01f0651b7c --- /dev/null +++ b/lm_eval/tasks/catalan_bench/siqa_ca.yaml @@ -0,0 +1,16 @@ +task: siqa_ca +dataset_path: projecte-aina/siqa_ca +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: null +doc_to_text: "Pregunta: {{context}} {{question}}\nResposta:" +target_delimiter: " " +doc_to_choice: "{{[answerA, answerB, answerC]}}" +doc_to_target: "{{ (label|int) - 1 }}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/teca.yaml b/lm_eval/tasks/catalan_bench/teca.yaml new file mode 100644 index 0000000000..c60acbd559 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/teca.yaml @@ -0,0 +1,18 @@ +task: teca +dataset_path: projecte-aina/teca +dataset_name: null +training_split: train +validation_split: validation +test_split: test +output_type: multiple_choice +process_docs: !function utils.process_doc_nli +doc_to_text: "" +doc_to_target: label +target_delimiter: "" +doc_to_choice: '{{[premise + ", correcte? Sí, " + hypothesis, premise + ", correcte? A més, " + hypothesis, premise + ", correcte? 
No, " + hypothesis]}}' +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/utils.py b/lm_eval/tasks/catalan_bench/utils.py new file mode 100644 index 0000000000..650749dc59 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/utils.py @@ -0,0 +1,122 @@ +import re +from itertools import product +import evaluate +import transformers.data.metrics.squad_metrics as squad_metrics +from lm_eval.utils import general_detokenize + + +def lowercase_first_letter(text): + return text[0].lower() + text[1:] + +def process_doc_nli(dataset): + def process_fn(doc): + # Detokenize(remove extra whitespaces) + doc["premise"] = general_detokenize(doc["premise"]).strip() + doc["hypothesis"] = general_detokenize(doc["hypothesis"]).strip() + # Remove last punctuation mark in the premise + doc["premise"] = doc["premise"][:-1] if doc["premise"].endswith((".", ",", "!", "?")) else doc["premise"] + # Lowercase the first letter in the hypothesis + doc["hypothesis"] = lowercase_first_letter(doc["hypothesis"]) + # Ensure that the hypothesis ends with a dot + doc["hypothesis"] = (doc["hypothesis"] + ".") if not doc["hypothesis"].endswith(".") else doc["hypothesis"] + return doc + return dataset.map(process_fn) + + +def process_results_coqcat(doc, results): + # Get all possible answers and compute the scores + turn_id = len(doc["questions"]) + answers = [doc["answers"]["input_text"][turn_id - 1]] + additional_answers_list = doc.get("additional_answers") + if additional_answers_list: + for key, additional_answers in additional_answers_list.items(): + if additional_answers["input_text"][turn_id - 1].lower() not in map(str.lower, answers): + answers.append(additional_answers["input_text"][turn_id - 1]) + + gold_list = answers + pred = results[0].strip().split("\n")[0] + #import code; code.interact(local=dict(globals(), **locals())) + + f1_sum = 0.0 + em_sum = 0.0 + if len(gold_list) > 1: + 
for i in range(len(gold_list)): + gold_answers = gold_list[0:i] + gold_list[i + 1:] + # predictions compared against (n) golds and take maximum + em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers) + f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers) + else: + em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list) + f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list) + #import code; code.interact(local=dict(globals(), **locals())) + return { + "em": em_sum / max(1, len(gold_list)), + "f1": f1_sum / max(1, len(gold_list)), + } + +def process_results_qa(doc, results): + preds = results[0] + reference = doc["answers"][0]["text"] + #import code; code.interact(local=dict(globals(), **locals())) + f1_sum = squad_metrics.compute_f1(reference, preds) + exact_match = squad_metrics.compute_exact(reference, preds) + return { + "f1": f1_sum, + "exact_match": exact_match + } + +def process_doc_cabreu(dataset): + def process_fn(doc): + # Remove duplicate spaces + doc["content"] = re.sub(r" +", " ", doc["content"]) + for summary_type, index in product(["abstractive", "extractive", "extreme"], ["a1", "a2", "a3"]): + doc["summaries"][summary_type][index] = re.sub(r" +", " ", doc["summaries"][summary_type][index]) + return doc + + return dataset.map(process_fn) + +def process_docs_paraphrases(dataset): + empty_docs = [] + def _process_doc(doc): + if doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]: + doc["sentence1"] = general_detokenize(doc["sentence1"]).strip() + doc["sentence2"] = general_detokenize(doc["sentence2"]).strip() + # Remove final punctuation mark in the first sentence + if doc["sentence1"].endswith((".", ",", ";")): + doc["sentence1"] = doc["sentence1"][:-1] + # Start the second sentence in lowercase (to be used after "Yes, ...") + doc["sentence2"] = lowercase_first_letter(doc["sentence2"]) + return doc + else: + empty_docs.append(doc) + return doc + if empty_docs != []: 
+ len_empty_docs = len(empty_docs) + print(f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}") + return dataset.filter(lambda doc: doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]).map(_process_doc) + + +def process_docs_copa_ca(dataset): + def _process_doc(doc): + doc["choice1"] = lowercase_first_letter(doc["choice1"]) + doc["choice2"] = lowercase_first_letter(doc["choice2"]) + return doc + return dataset.map(_process_doc) + + +def rouge1(items): + """ + # passthrough for efficiency + """ + return items + + +def rouge1_agg(items): + """ + Higher is better + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + rouge_scorer = evaluate.load("rouge") + return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"] + diff --git a/lm_eval/tasks/catalan_bench/wnli_ca.yaml b/lm_eval/tasks/catalan_bench/wnli_ca.yaml new file mode 100644 index 0000000000..ba9d8b1e8a --- /dev/null +++ b/lm_eval/tasks/catalan_bench/wnli_ca.yaml @@ -0,0 +1,14 @@ +task: wnli_ca +dataset_path: projecte-aina/wnli-ca +dataset_name: null +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: null +doc_to_text: "{{sentence1}}\nPregunta: {{sentence2}} Cert o Fals?\nResposta:" +doc_to_target: label +doc_to_choice: ["Fals", "Cert"] +metric_list: + - metric: acc +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/xnli_ca.yaml b/lm_eval/tasks/catalan_bench/xnli_ca.yaml new file mode 100644 index 0000000000..959b4775d1 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/xnli_ca.yaml @@ -0,0 +1,21 @@ +group: + - xnli +task: xnli_ca +dataset_path: projecte-aina/xnli-ca +dataset_name: null +include: ../xnli/xnli_common_yaml +output_type: multiple_choice +doc_to_choice: '{{[premise+", correcte? Sí, "+hypothesis,premise+", correcte? A més, + "+hypothesis,premise+", correcte? 
No, "+hypothesis]}}' +doc_to_text: '' +target_delimiter: '' +process_docs: !function utils.process_doc_nli +training_split: null +validation_split: validation +doc_to_target: label +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/xquad_ca.yaml b/lm_eval/tasks/catalan_bench/xquad_ca.yaml new file mode 100644 index 0000000000..e70a59a978 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/xquad_ca.yaml @@ -0,0 +1,25 @@ +task: xquad_ca +dataset_path: projecte-aina/xquad-ca +dataset_name: null +output_type: generate_until +doc_to_text: "Context: {{context}}\n\nPregunta: {{question}}\n\nResposta:" +doc_to_target: '{{answers[0]["text"]}}' +validation_split: null +test_split: test +target_delimiter: ' ' +process_results: !function utils.process_results_qa +test_split: test +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + - metric: f1 + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml b/lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml new file mode 100644 index 0000000000..11491d9267 --- /dev/null +++ b/lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml @@ -0,0 +1,17 @@ +task: xstorycloze_ca +dataset_path: projecte-aina/xstorycloze_ca +dataset_name: ca +output_type: multiple_choice +training_split: train +validation_split: eval +doc_to_text: "{{[input_sentence_1, input_sentence_2, input_sentence_3, input_sentence_4]|join(' ')}}" +doc_to_target: "{{answer_right_ending-1}}" +doc_to_choice: "{{[sentence_quiz1, sentence_quiz2]}}" +should_decontaminate: true +doc_to_decontamination_query: "{{[input_sentence_1, input_sentence_2, input_sentence_3, input_sentence_4]|join(' ')}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true 
+metadata: + version: 1.0 \ No newline at end of file From 19fc11308db1f465e7886e1c423cf5bde828ad5f Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Mon, 29 Jul 2024 17:37:50 +0200 Subject: [PATCH 2/4] added flores_ca.yaml --- .../flores_ca/_flores_common_yaml | 2 ++ .../flores_ca/create-yamls_flores_ca.py | 2 +- .../catalan_bench/flores_ca/flores_ca-de.yaml | 1 - .../catalan_bench/flores_ca/flores_ca-en.yaml | 1 - .../catalan_bench/flores_ca/flores_ca-es.yaml | 1 - .../catalan_bench/flores_ca/flores_ca-eu.yaml | 1 - .../catalan_bench/flores_ca/flores_ca-fr.yaml | 1 - .../catalan_bench/flores_ca/flores_ca-gl.yaml | 1 - .../catalan_bench/flores_ca/flores_ca-it.yaml | 1 - .../catalan_bench/flores_ca/flores_ca-pt.yaml | 1 - .../catalan_bench/flores_ca/flores_ca.yaml | 23 +++++++++++++++++++ .../catalan_bench/flores_ca/flores_de-ca.yaml | 1 - .../catalan_bench/flores_ca/flores_en-ca.yaml | 1 - .../catalan_bench/flores_ca/flores_es-ca.yaml | 1 - .../catalan_bench/flores_ca/flores_eu-ca.yaml | 1 - .../catalan_bench/flores_ca/flores_fr-ca.yaml | 1 - .../catalan_bench/flores_ca/flores_gl-ca.yaml | 1 - .../catalan_bench/flores_ca/flores_it-ca.yaml | 1 - .../catalan_bench/flores_ca/flores_pt-ca.yaml | 1 - 19 files changed, 26 insertions(+), 17 deletions(-) create mode 100644 lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml diff --git a/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml b/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml index 075a66b5f4..59a9b14aaf 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml @@ -21,3 +21,5 @@ metric_list: higher_is_better: true metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py b/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py index eeadc3ed76..b83295f445 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py 
+++ b/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py @@ -82,7 +82,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: yaml.dump( { # "group": [f"{BENCH_NAME}_bench", f"{BENCH_NAME}_bench_flores"], - "group": "flores_ca", +# "group": "flores_ca", "include": "_flores_common_yaml", "task": f"flores_{lang_pair_name}", "doc_to_text": doc_to_text(src, tgt), diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml index 0bb415a04b..15eb02afb6 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_ca-de doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml index 3aa1351307..9a8f5ffeb8 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_ca-en doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml index 1b0cf7c7f0..9a6aa44240 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-es.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_ca-es doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml index fd540c6425..48ffe7bf5c 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml +++ 
b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-eu.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_ca-eu doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml index 5aa495781d..99b40c1462 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_ca-fr doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml index d33cdd505e..5da7ad5fe4 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_ca-gl doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml index 61431e8da2..20f8d99f9f 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_ca-it doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml index 3b0e70b550..565f6267c5 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: 
_flores_common_yaml task: flores_ca-pt doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml new file mode 100644 index 0000000000..9bc682eb5c --- /dev/null +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml @@ -0,0 +1,23 @@ +group: flores_ca +task: + - flores_es-ca + - flores_ca-es + - flores_en-ca + - flores_ca-en + - flores_eu-ca + - flores_ca-eu + - flores_pt-ca + - flores_ca-pt + - flores_it-ca + - flores_ca-it + - flores_fr-ca + - flores_ca-fr + - flores_ca-gl + - flores_gl-ca + - flores_ca-de + - flores_de-ca +aggregate_metric_list: + - metric: bleu + aggregation: mean +metadata: + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml index 363bd62ced..af3d0eb493 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_de-ca doc_to_text: 'German sentence: {{sentence_deu_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml index 81706f6dc2..16132ff497 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_en-ca doc_to_text: 'English sentence: {{sentence_eng_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml index a74437e392..e35b715213 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_es-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` 
-group: flores_ca include: _flores_common_yaml task: flores_es-ca doc_to_text: 'Spanish sentence: {{sentence_spa_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml index dcf6e3760f..c8be6ee93b 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_eu-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_eu-ca doc_to_text: 'Basque sentence: {{sentence_eus_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml index a5abc7ead8..0d2de77edf 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_fr-ca doc_to_text: 'French sentence: {{sentence_fra_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml index 78c554086f..6ce3eaae5c 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_gl-ca doc_to_text: 'Galician sentence: {{sentence_glg_Latn}} diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml index 128834d976..db811154e5 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_it-ca doc_to_text: 'Italian sentence: {{sentence_ita_Latn}} diff --git 
a/lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml index dd355b797c..196295c9e3 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml @@ -1,5 +1,4 @@ # File generated by `create-yamls.py` -group: flores_ca include: _flores_common_yaml task: flores_pt-ca doc_to_text: 'Portuguese sentence: {{sentence_por_Latn}} From fb8f0c1b0f3de0faf3bc8602b69c4c23a6f19b20 Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Fri, 27 Sep 2024 18:08:33 +0200 Subject: [PATCH 3/4] Updated some task groupings and readme --- lm_eval/tasks/README.md | 1 + lm_eval/tasks/catalan_bench/README.md | 8 +-- .../tasks/catalan_bench/_arc_ca_common_yaml | 3 +- .../tasks/catalan_bench/_cabreu_common_yaml | 2 +- lm_eval/tasks/catalan_bench/catalanqa.yaml | 2 +- lm_eval/tasks/catalan_bench/copa_ca.yaml | 2 +- lm_eval/tasks/catalan_bench/coqcat.yaml | 2 +- ...flores_ca.py => create_yamls_flores_ca.py} | 0 .../catalan_bench/flores_ca/flores_ca.yaml | 1 + .../tasks/catalan_bench/mgsm_direct_ca.yaml | 2 - .../tasks/catalan_bench/openbookqa_ca.yaml | 2 +- lm_eval/tasks/catalan_bench/parafraseja.yaml | 2 +- lm_eval/tasks/catalan_bench/paws_ca.yaml | 4 +- .../phrases_va/_phrases_va_common.yaml | 2 +- lm_eval/tasks/catalan_bench/siqa_ca.yaml | 2 +- lm_eval/tasks/catalan_bench/teca.yaml | 2 +- lm_eval/tasks/catalan_bench/utils.py | 66 ++++++++++++------- lm_eval/tasks/catalan_bench/wnli_ca.yaml | 2 +- lm_eval/tasks/catalan_bench/xnli_ca.yaml | 4 +- lm_eval/tasks/catalan_bench/xquad_ca.yaml | 3 +- .../tasks/catalan_bench/xstorycloze_ca.yaml | 2 +- 21 files changed, 63 insertions(+), 51 deletions(-) rename lm_eval/tasks/catalan_bench/flores_ca/{create-yamls_flores_ca.py => create_yamls_flores_ca.py} (100%) diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index 3903db98e8..de68b320bb 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -121,3 +121,4 @@ 
| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque | | [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese | | [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese | +| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan | diff --git a/lm_eval/tasks/catalan_bench/README.md b/lm_eval/tasks/catalan_bench/README.md index 32b1b0fc34..73dec948fe 100644 --- a/lm_eval/tasks/catalan_bench/README.md +++ b/lm_eval/tasks/catalan_bench/README.md @@ -54,10 +54,11 @@ Paper for CatalanBench coming soon. #### Groups - `catalan_bench`: All tasks included in CatalanBench. -- `cabreu`: Three CaBREU tasks for each type of summary (extractive, abstractive and extreme). - `flores_ca`: All FLORES translation tasks from or to Catalan. -- `phrases_ca`: Two Phrases_va tasks for language adaptation between Catalan and Valencian. +#### Tags +- `cabreu`: Three CaBREU tasks for each type of summary (extractive, abstractive and extreme). +- `phrases_va`: Two Phrases_va tasks for language adaptation between Catalan and Valencian. #### Tasks @@ -105,9 +106,6 @@ The following tasks evaluate tasks on CatalanBench dataset using various scoring Some of these tasks are taken from benchmarks already available in LM Evaluation Harness. 
These are: - `belebele_cat_Latn`: Belebele Catalan -- `veritasqa_gen_ca`: VeritasQA Catalan -- `veritasqa_mc1_ca`: VeritasQA Catalan -- `veritasqa_mc2_ca`: VeritasQA Catalan ### Checklist diff --git a/lm_eval/tasks/catalan_bench/_arc_ca_common_yaml b/lm_eval/tasks/catalan_bench/_arc_ca_common_yaml index 38dfc08b46..b89290ebaf 100644 --- a/lm_eval/tasks/catalan_bench/_arc_ca_common_yaml +++ b/lm_eval/tasks/catalan_bench/_arc_ca_common_yaml @@ -1,5 +1,4 @@ -group: - - ai2_arc +tag: arc_ca dataset_path: projecte-aina/arc_ca output_type: multiple_choice training_split: null diff --git a/lm_eval/tasks/catalan_bench/_cabreu_common_yaml b/lm_eval/tasks/catalan_bench/_cabreu_common_yaml index 85c51a18b6..c66e8bc486 100644 --- a/lm_eval/tasks/catalan_bench/_cabreu_common_yaml +++ b/lm_eval/tasks/catalan_bench/_cabreu_common_yaml @@ -1,4 +1,4 @@ -group: cabreu +tag: cabreu dataset_path: projecte-aina/caBreu dataset_name: null output_type: generate_until diff --git a/lm_eval/tasks/catalan_bench/catalanqa.yaml b/lm_eval/tasks/catalan_bench/catalanqa.yaml index 8861794e94..926cdfa1be 100644 --- a/lm_eval/tasks/catalan_bench/catalanqa.yaml +++ b/lm_eval/tasks/catalan_bench/catalanqa.yaml @@ -22,4 +22,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/copa_ca.yaml b/lm_eval/tasks/catalan_bench/copa_ca.yaml index c7ca3f11a1..d376ad3aea 100644 --- a/lm_eval/tasks/catalan_bench/copa_ca.yaml +++ b/lm_eval/tasks/catalan_bench/copa_ca.yaml @@ -14,4 +14,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/coqcat.yaml b/lm_eval/tasks/catalan_bench/coqcat.yaml index 12ab7d7cdf..95145a7492 100644 --- a/lm_eval/tasks/catalan_bench/coqcat.yaml +++ b/lm_eval/tasks/catalan_bench/coqcat.yaml @@ -20,4 +20,4 @@ metric_list: aggregation: mean higher_is_better: true 
metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py b/lm_eval/tasks/catalan_bench/flores_ca/create_yamls_flores_ca.py similarity index 100% rename from lm_eval/tasks/catalan_bench/flores_ca/create-yamls_flores_ca.py rename to lm_eval/tasks/catalan_bench/flores_ca/create_yamls_flores_ca.py diff --git a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml index 9bc682eb5c..4726daa83e 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml +++ b/lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml @@ -19,5 +19,6 @@ task: aggregate_metric_list: - metric: bleu aggregation: mean + weight_by_size: false metadata: version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml b/lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml index 754d5b91ce..066336a67f 100644 --- a/lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml +++ b/lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml @@ -1,5 +1,3 @@ -group: - - mgsm_direct task: mgsm_direct_ca dataset_path: projecte-aina/mgsm_ca doc_to_target: '{{answer_number|string}}' diff --git a/lm_eval/tasks/catalan_bench/openbookqa_ca.yaml b/lm_eval/tasks/catalan_bench/openbookqa_ca.yaml index 00a1f03d4f..868be75612 100644 --- a/lm_eval/tasks/catalan_bench/openbookqa_ca.yaml +++ b/lm_eval/tasks/catalan_bench/openbookqa_ca.yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/parafraseja.yaml b/lm_eval/tasks/catalan_bench/parafraseja.yaml index 208e3e373f..060d488d18 100644 --- a/lm_eval/tasks/catalan_bench/parafraseja.yaml +++ b/lm_eval/tasks/catalan_bench/parafraseja.yaml @@ -14,4 +14,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git 
a/lm_eval/tasks/catalan_bench/paws_ca.yaml b/lm_eval/tasks/catalan_bench/paws_ca.yaml index c9fbd04f9a..e736f5c746 100644 --- a/lm_eval/tasks/catalan_bench/paws_ca.yaml +++ b/lm_eval/tasks/catalan_bench/paws_ca.yaml @@ -1,5 +1,3 @@ -group: - - pawsx task: paws_ca dataset_path: projecte-aina/PAWS-ca dataset_name: null @@ -17,4 +15,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml b/lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml index 48232725b2..f59a2098ca 100644 --- a/lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml +++ b/lm_eval/tasks/catalan_bench/phrases_va/_phrases_va_common.yaml @@ -1,4 +1,4 @@ -group: phrases_va +tag: phrases_va dataset_path: gplsi/CA-VA_alignment_test output_type: generate_until training_split: null diff --git a/lm_eval/tasks/catalan_bench/siqa_ca.yaml b/lm_eval/tasks/catalan_bench/siqa_ca.yaml index 01f0651b7c..8a39a37f5c 100644 --- a/lm_eval/tasks/catalan_bench/siqa_ca.yaml +++ b/lm_eval/tasks/catalan_bench/siqa_ca.yaml @@ -13,4 +13,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/teca.yaml b/lm_eval/tasks/catalan_bench/teca.yaml index c60acbd559..8978c2c969 100644 --- a/lm_eval/tasks/catalan_bench/teca.yaml +++ b/lm_eval/tasks/catalan_bench/teca.yaml @@ -15,4 +15,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/utils.py b/lm_eval/tasks/catalan_bench/utils.py index 650749dc59..ced91772ca 100644 --- a/lm_eval/tasks/catalan_bench/utils.py +++ b/lm_eval/tasks/catalan_bench/utils.py @@ -1,25 +1,37 @@ import re from itertools import product + import evaluate import transformers.data.metrics.squad_metrics as squad_metrics + from 
lm_eval.utils import general_detokenize def lowercase_first_letter(text): - return text[0].lower() + text[1:] + return text[0].lower() + text[1:] + def process_doc_nli(dataset): def process_fn(doc): # Detokenize(remove extra whitespaces) doc["premise"] = general_detokenize(doc["premise"]).strip() doc["hypothesis"] = general_detokenize(doc["hypothesis"]).strip() - # Remove last punctuation mark in the premise - doc["premise"] = doc["premise"][:-1] if doc["premise"].endswith((".", ",", "!", "?")) else doc["premise"] - # Lowercase the first letter in the hypothesis + # Remove last punctuation mark in the premise + doc["premise"] = ( + doc["premise"][:-1] + if doc["premise"].endswith((".", ",", "!", "?")) + else doc["premise"] + ) + # Lowercase the first letter in the hypothesis doc["hypothesis"] = lowercase_first_letter(doc["hypothesis"]) - # Ensure that the hypothesis ends with a dot - doc["hypothesis"] = (doc["hypothesis"] + ".") if not doc["hypothesis"].endswith(".") else doc["hypothesis"] + # Ensure that the hypothesis ends with a dot + doc["hypothesis"] = ( + (doc["hypothesis"] + ".") + if not doc["hypothesis"].endswith(".") + else doc["hypothesis"] + ) return doc + return dataset.map(process_fn) @@ -30,53 +42,60 @@ def process_results_coqcat(doc, results): additional_answers_list = doc.get("additional_answers") if additional_answers_list: for key, additional_answers in additional_answers_list.items(): - if additional_answers["input_text"][turn_id - 1].lower() not in map(str.lower, answers): + if additional_answers["input_text"][turn_id - 1].lower() not in map( + str.lower, answers + ): answers.append(additional_answers["input_text"][turn_id - 1]) gold_list = answers pred = results[0].strip().split("\n")[0] - #import code; code.interact(local=dict(globals(), **locals())) - + # import code; code.interact(local=dict(globals(), **locals())) + f1_sum = 0.0 em_sum = 0.0 if len(gold_list) > 1: for i in range(len(gold_list)): - gold_answers = gold_list[0:i] + 
gold_list[i + 1:] + gold_answers = gold_list[0:i] + gold_list[i + 1 :] # predictions compared against (n) golds and take maximum em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers) f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers) else: em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list) f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list) - #import code; code.interact(local=dict(globals(), **locals())) + # import code; code.interact(local=dict(globals(), **locals())) return { "em": em_sum / max(1, len(gold_list)), "f1": f1_sum / max(1, len(gold_list)), } + def process_results_qa(doc, results): preds = results[0] reference = doc["answers"][0]["text"] - #import code; code.interact(local=dict(globals(), **locals())) + # import code; code.interact(local=dict(globals(), **locals())) f1_sum = squad_metrics.compute_f1(reference, preds) exact_match = squad_metrics.compute_exact(reference, preds) - return { - "f1": f1_sum, - "exact_match": exact_match - } + return {"f1": f1_sum, "exact_match": exact_match} + def process_doc_cabreu(dataset): def process_fn(doc): # Remove duplicate spaces doc["content"] = re.sub(r" +", " ", doc["content"]) - for summary_type, index in product(["abstractive", "extractive", "extreme"], ["a1", "a2", "a3"]): - doc["summaries"][summary_type][index] = re.sub(r" +", " ", doc["summaries"][summary_type][index]) + for summary_type, index in product( + ["abstractive", "extractive", "extreme"], ["a1", "a2", "a3"] + ): + doc["summaries"][summary_type][index] = re.sub( + r" +", " ", doc["summaries"][summary_type][index] + ) return doc return dataset.map(process_fn) + def process_docs_paraphrases(dataset): empty_docs = [] + def _process_doc(doc): if doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]: doc["sentence1"] = general_detokenize(doc["sentence1"]).strip() @@ -90,10 +109,11 @@ def _process_doc(doc): else: empty_docs.append(doc) return doc - if 
empty_docs != []: - len_empty_docs = len(empty_docs) - print(f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}") - return dataset.filter(lambda doc: doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]).map(_process_doc) + + return dataset.filter( + lambda doc: doc["sentence1"] not in [None, ""] + and doc["sentence2"] not in [None, ""] + ).map(_process_doc) def process_docs_copa_ca(dataset): @@ -101,6 +121,7 @@ def _process_doc(doc): doc["choice1"] = lowercase_first_letter(doc["choice1"]) doc["choice2"] = lowercase_first_letter(doc["choice2"]) return doc + return dataset.map(_process_doc) @@ -119,4 +140,3 @@ def rouge1_agg(items): preds = list(zip(*items))[1] rouge_scorer = evaluate.load("rouge") return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"] - diff --git a/lm_eval/tasks/catalan_bench/wnli_ca.yaml b/lm_eval/tasks/catalan_bench/wnli_ca.yaml index ba9d8b1e8a..d4deec5c04 100644 --- a/lm_eval/tasks/catalan_bench/wnli_ca.yaml +++ b/lm_eval/tasks/catalan_bench/wnli_ca.yaml @@ -11,4 +11,4 @@ doc_to_choice: ["Fals", "Cert"] metric_list: - metric: acc metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/xnli_ca.yaml b/lm_eval/tasks/catalan_bench/xnli_ca.yaml index 959b4775d1..44f0f44302 100644 --- a/lm_eval/tasks/catalan_bench/xnli_ca.yaml +++ b/lm_eval/tasks/catalan_bench/xnli_ca.yaml @@ -1,5 +1,3 @@ -group: - - xnli task: xnli_ca dataset_path: projecte-aina/xnli-ca dataset_name: null @@ -18,4 +16,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/xquad_ca.yaml b/lm_eval/tasks/catalan_bench/xquad_ca.yaml index e70a59a978..9b72c7da74 100644 --- a/lm_eval/tasks/catalan_bench/xquad_ca.yaml +++ b/lm_eval/tasks/catalan_bench/xquad_ca.yaml @@ -8,7 +8,6 @@ validation_split: null test_split: test 
target_delimiter: ' ' process_results: !function utils.process_results_qa -test_split: test generation_kwargs: until: - "\n" @@ -22,4 +21,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 diff --git a/lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml b/lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml index 11491d9267..61a7c2991f 100644 --- a/lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml +++ b/lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml @@ -14,4 +14,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 \ No newline at end of file + version: 1.0 From 43c9a497b9805ec48e1593aee88c1a8d14a8a096 Mon Sep 17 00:00:00 2001 From: zxcvuser Date: Mon, 30 Sep 2024 17:30:45 +0200 Subject: [PATCH 4/4] Fix create_yamls_flores_ca.py --- .../flores_ca/create_yamls_flores_ca.py | 299 +++++++++++++++--- 1 file changed, 259 insertions(+), 40 deletions(-) diff --git a/lm_eval/tasks/catalan_bench/flores_ca/create_yamls_flores_ca.py b/lm_eval/tasks/catalan_bench/flores_ca/create_yamls_flores_ca.py index b83295f445..6125b97266 100644 --- a/lm_eval/tasks/catalan_bench/flores_ca/create_yamls_flores_ca.py +++ b/lm_eval/tasks/catalan_bench/flores_ca/create_yamls_flores_ca.py @@ -4,50 +4,256 @@ """ import argparse + import yaml -from langcodes import * -from itertools import * +from langcodes import Language -# utils -flatten = lambda l: list(itertools.chain(*l)) # constants _LANGUAGES = [ -"ace_Arab", "bam_Latn", "dzo_Tibt", "hin_Deva", "khm_Khmr", "mag_Deva", "pap_Latn", "sot_Latn", "tur_Latn", -"ace_Latn", "ban_Latn", "ell_Grek", "hne_Deva", "kik_Latn", "mai_Deva", "pbt_Arab", "spa_Latn", "twi_Latn", -"acm_Arab", "bel_Cyrl", "eng_Latn", "hrv_Latn", "kin_Latn", "mal_Mlym", "pes_Arab", "srd_Latn", "tzm_Tfng", -"acq_Arab", "bem_Latn", "epo_Latn", "hun_Latn", "kir_Cyrl", "mar_Deva", "plt_Latn", "srp_Cyrl", "uig_Arab", -"aeb_Arab", "ben_Beng", "est_Latn", "hye_Armn", "kmb_Latn", 
"min_Arab", "pol_Latn", "ssw_Latn", "ukr_Cyrl", -"afr_Latn", "bho_Deva", "eus_Latn", "ibo_Latn", "kmr_Latn", "min_Latn", "por_Latn", "sun_Latn", "umb_Latn", -"ajp_Arab", "bjn_Arab", "ewe_Latn", "ilo_Latn", "knc_Arab", "mkd_Cyrl", "prs_Arab", "swe_Latn", "urd_Arab", -"aka_Latn", "bjn_Latn", "fao_Latn", "ind_Latn", "knc_Latn", "mlt_Latn", "quy_Latn", "swh_Latn", "uzn_Latn", -"als_Latn", "bod_Tibt", "fij_Latn", "isl_Latn", "kon_Latn", "mni_Beng", "ron_Latn", "szl_Latn", "vec_Latn", -"amh_Ethi", "bos_Latn", "fin_Latn", "ita_Latn", "kor_Hang", "mos_Latn", "run_Latn", "tam_Taml", "vie_Latn", -"apc_Arab", "bug_Latn", "fon_Latn", "jav_Latn", "lao_Laoo", "mri_Latn", "rus_Cyrl", "taq_Latn", "war_Latn", -"arb_Arab", "bul_Cyrl", "fra_Latn", "jpn_Jpan", "lij_Latn", "mya_Mymr", "sag_Latn", "taq_Tfng", "wol_Latn", -"arb_Latn", "cat_Latn", "fur_Latn", "kab_Latn", "lim_Latn", "nld_Latn", "san_Deva", "tat_Cyrl", "xho_Latn", -"ars_Arab", "ceb_Latn", "fuv_Latn", "kac_Latn", "lin_Latn", "nno_Latn", "sat_Olck", "tel_Telu", "ydd_Hebr", -"ary_Arab", "ces_Latn", "gaz_Latn", "kam_Latn", "lit_Latn", "nob_Latn", "scn_Latn", "tgk_Cyrl", "yor_Latn", -"arz_Arab", "cjk_Latn", "gla_Latn", "kan_Knda", "lmo_Latn", "npi_Deva", "shn_Mymr", "tgl_Latn", "yue_Hant", -"asm_Beng", "ckb_Arab", "gle_Latn", "kas_Arab", "ltg_Latn", "nso_Latn", "sin_Sinh", "tha_Thai", "zho_Hans", -"ast_Latn", "crh_Latn", "glg_Latn", "kas_Deva", "ltz_Latn", "nus_Latn", "slk_Latn", "tir_Ethi", "zho_Hant", -"awa_Deva", "cym_Latn", "grn_Latn", "kat_Geor", "lua_Latn", "nya_Latn", "slv_Latn", "tpi_Latn", "zsm_Latn", -"ayr_Latn", "dan_Latn", "guj_Gujr", "kaz_Cyrl", "lug_Latn", "oci_Latn", "smo_Latn", "tsn_Latn", "zul_Latn", -"azb_Arab", "deu_Latn", "hat_Latn", "kbp_Latn", "luo_Latn", "ory_Orya", "sna_Latn", "tso_Latn", -"azj_Latn", "dik_Latn", "hau_Latn", "kea_Latn", "lus_Latn", "pag_Latn", "snd_Arab", "tuk_Latn", -"bak_Cyrl", "dyu_Latn", "heb_Hebr", "khk_Cyrl", "lvs_Latn", "pan_Guru", "som_Latn", "tum_Latn" + "ace_Arab", + 
"bam_Latn", + "dzo_Tibt", + "hin_Deva", + "khm_Khmr", + "mag_Deva", + "pap_Latn", + "sot_Latn", + "tur_Latn", + "ace_Latn", + "ban_Latn", + "ell_Grek", + "hne_Deva", + "kik_Latn", + "mai_Deva", + "pbt_Arab", + "spa_Latn", + "twi_Latn", + "acm_Arab", + "bel_Cyrl", + "eng_Latn", + "hrv_Latn", + "kin_Latn", + "mal_Mlym", + "pes_Arab", + "srd_Latn", + "tzm_Tfng", + "acq_Arab", + "bem_Latn", + "epo_Latn", + "hun_Latn", + "kir_Cyrl", + "mar_Deva", + "plt_Latn", + "srp_Cyrl", + "uig_Arab", + "aeb_Arab", + "ben_Beng", + "est_Latn", + "hye_Armn", + "kmb_Latn", + "min_Arab", + "pol_Latn", + "ssw_Latn", + "ukr_Cyrl", + "afr_Latn", + "bho_Deva", + "eus_Latn", + "ibo_Latn", + "kmr_Latn", + "min_Latn", + "por_Latn", + "sun_Latn", + "umb_Latn", + "ajp_Arab", + "bjn_Arab", + "ewe_Latn", + "ilo_Latn", + "knc_Arab", + "mkd_Cyrl", + "prs_Arab", + "swe_Latn", + "urd_Arab", + "aka_Latn", + "bjn_Latn", + "fao_Latn", + "ind_Latn", + "knc_Latn", + "mlt_Latn", + "quy_Latn", + "swh_Latn", + "uzn_Latn", + "als_Latn", + "bod_Tibt", + "fij_Latn", + "isl_Latn", + "kon_Latn", + "mni_Beng", + "ron_Latn", + "szl_Latn", + "vec_Latn", + "amh_Ethi", + "bos_Latn", + "fin_Latn", + "ita_Latn", + "kor_Hang", + "mos_Latn", + "run_Latn", + "tam_Taml", + "vie_Latn", + "apc_Arab", + "bug_Latn", + "fon_Latn", + "jav_Latn", + "lao_Laoo", + "mri_Latn", + "rus_Cyrl", + "taq_Latn", + "war_Latn", + "arb_Arab", + "bul_Cyrl", + "fra_Latn", + "jpn_Jpan", + "lij_Latn", + "mya_Mymr", + "sag_Latn", + "taq_Tfng", + "wol_Latn", + "arb_Latn", + "cat_Latn", + "fur_Latn", + "kab_Latn", + "lim_Latn", + "nld_Latn", + "san_Deva", + "tat_Cyrl", + "xho_Latn", + "ars_Arab", + "ceb_Latn", + "fuv_Latn", + "kac_Latn", + "lin_Latn", + "nno_Latn", + "sat_Olck", + "tel_Telu", + "ydd_Hebr", + "ary_Arab", + "ces_Latn", + "gaz_Latn", + "kam_Latn", + "lit_Latn", + "nob_Latn", + "scn_Latn", + "tgk_Cyrl", + "yor_Latn", + "arz_Arab", + "cjk_Latn", + "gla_Latn", + "kan_Knda", + "lmo_Latn", + "npi_Deva", + "shn_Mymr", + "tgl_Latn", + "yue_Hant", 
+ "asm_Beng", + "ckb_Arab", + "gle_Latn", + "kas_Arab", + "ltg_Latn", + "nso_Latn", + "sin_Sinh", + "tha_Thai", + "zho_Hans", + "ast_Latn", + "crh_Latn", + "glg_Latn", + "kas_Deva", + "ltz_Latn", + "nus_Latn", + "slk_Latn", + "tir_Ethi", + "zho_Hant", + "awa_Deva", + "cym_Latn", + "grn_Latn", + "kat_Geor", + "lua_Latn", + "nya_Latn", + "slv_Latn", + "tpi_Latn", + "zsm_Latn", + "ayr_Latn", + "dan_Latn", + "guj_Gujr", + "kaz_Cyrl", + "lug_Latn", + "oci_Latn", + "smo_Latn", + "tsn_Latn", + "zul_Latn", + "azb_Arab", + "deu_Latn", + "hat_Latn", + "kbp_Latn", + "luo_Latn", + "ory_Orya", + "sna_Latn", + "tso_Latn", + "azj_Latn", + "dik_Latn", + "hau_Latn", + "kea_Latn", + "lus_Latn", + "pag_Latn", + "snd_Arab", + "tuk_Latn", + "bak_Cyrl", + "dyu_Latn", + "heb_Hebr", + "khk_Cyrl", + "lvs_Latn", + "pan_Guru", + "som_Latn", + "tum_Latn", +] +LANGUAGE_PAIRS = [ + (a, b) for idx, a in enumerate(_LANGUAGES) for b in _LANGUAGES[idx + 1 :] ] -LANGUAGE_PAIRS = [(a, b) for idx, a in enumerate(_LANGUAGES) for b in _LANGUAGES[idx + 1:]] -LANGUAGES_OF_INTEREST = ["cat_Latn", "spa_Latn", "eng_Latn", "glg_Latn", "eus_Latn", "ita_Latn", "deu_Latn", "por_Latn", "fra_Latn"] +LANGUAGES_OF_INTEREST = [ + "cat_Latn", + "spa_Latn", + "eng_Latn", + "glg_Latn", + "eus_Latn", + "ita_Latn", + "deu_Latn", + "por_Latn", + "fra_Latn", +] MAIN_LANG = "cat_Latn" -LANGUAGE_PAIRS = [(a, b) for (a, b) in LANGUAGE_PAIRS if a in LANGUAGES_OF_INTEREST and b in LANGUAGES_OF_INTEREST and "cat_Latn" in (a, b)] +LANGUAGE_PAIRS = [ + (a, b) + for (a, b) in LANGUAGE_PAIRS + if a in LANGUAGES_OF_INTEREST + and b in LANGUAGES_OF_INTEREST + and "cat_Latn" in (a, b) +] # auxiliary functions -code_to_language_name = lambda code: Language.make(language=Language.get(code)["language"]).display_name() -code_to_short_name = lambda code: Language.get(code)["language"] -jinja_var = lambda s: "{{" + s + "}}" # wrapper to avoid having to escape { } in format strings + +def code_to_language_name(code): + return 
Language.make(language=Language.get(code)["language"]).display_name() + + +def code_to_short_name(code): + return Language.get(code)["language"] + + +def jinja_var(s): + return "{{" + s + "}}" + def doc_to_text(src: str, tgt: str) -> str: src_name, tgt_name = map(code_to_language_name, [src, tgt]) @@ -56,12 +262,14 @@ def doc_to_text(src: str, tgt: str) -> str: {src_name} sentence: {jinja_var('sentence_' + src)} {tgt_name} sentence:""" -def doc_to_target(tgt: str) -> str: +def doc_to_target(tgt: str) -> str: return f"{jinja_var('sentence_' + tgt)}" + # main function + def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: """ Generate a YAML file for each translation direction. @@ -69,20 +277,23 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: err = [] for src, tgt in LANGUAGE_PAIRS: - # do both translation directions for each lang pair for src, tgt in [(src, tgt), (tgt, src)]: lang_pair_name = f"{code_to_short_name(src)}-{code_to_short_name(tgt)}" yaml_file_name = f"flores_{lang_pair_name}.yaml" try: - with open( f"{output_dir}/{yaml_file_name}", "w" if overwrite else "x", encoding="utf-8") as outfile: + with open( + f"{output_dir}/{yaml_file_name}", + "w" if overwrite else "x", + encoding="utf-8", + ) as outfile: print(f"Creating {yaml_file_name}...") outfile.write("# File generated by `create-yamls.py`\n") yaml.dump( { -# "group": [f"{BENCH_NAME}_bench", f"{BENCH_NAME}_bench_flores"], -# "group": "flores_ca", + # "group": [f"{BENCH_NAME}_bench", f"{BENCH_NAME}_bench_flores"], + # "group": "flores_ca", "include": "_flores_common_yaml", "task": f"flores_{lang_pair_name}", "doc_to_text": doc_to_text(src, tgt), @@ -105,11 +316,19 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: def main() -> None: parser = argparse.ArgumentParser() - parser.add_argument("--overwrite", default=False, action="store_true", help="Overwrite files if they already exist") - parser.add_argument( "--output-dir", default=".", help="Directory to write 
yaml files to" ) + parser.add_argument( + "--overwrite", + default=False, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", default=".", help="Directory to write yaml files to" + ) args = parser.parse_args() gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite) + if __name__ == "__main__": main()