MedHelm: Add VQA-RAD scenario and specs #3246

Open · wants to merge 14 commits into base `med-helm`
4 changes: 4 additions & 0 deletions docs/tutorial.md
@@ -27,6 +27,10 @@ The meaning of the arguments are as follows:
- The environment directory is `prod_env/` by default and can be set using `--local-path`. Credentials for making API calls should be added to a `credentials.conf` file in this directory.
- The output directory is `benchmark_output/` by default and can be set using `--output-path`.

> 📘 Good to know
>
> When running the command on cloud compute instances, SQLite may throw an `OperationalError: database is locked` because the environment directory is **network mounted**. To work around the issue, set `--local-path` to a path on local disk (typically under `/tmp` or `/mnt`).
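> For example, adding `--local-path /tmp/prod_env` to the `helm-run` invocation above keeps the SQLite files on local disk; remember to place `credentials.conf` in that directory as well.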

After running this command, navigate to the `benchmark_output/runs/my-suite/` directory. This should contain two sub-directories named `mmlu:subject=anatomy,model=openai_gpt2` and `mmlu:subject=philosophy,model=openai_gpt2`. Note that the names of these sub-directories are based on the run entries we used earlier, but with `/` replaced with `_`.

Each output sub-directory will contain several JSON files that were generated during the corresponding run:
4 changes: 2 additions & 2 deletions helm-frontend/project_metadata.json
@@ -3,7 +3,7 @@
"title": "Lite",
"description": "Lightweight, broad evaluation of the capabilities of language models using in-context learning",
"id": "lite",
"releases": ["v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"]
"releases": ["v1.11.0", "v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"]
},
{
"title": "Classic",
@@ -27,7 +27,7 @@
"title": "MMLU",
"description": "Massive Multitask Language Understanding (MMLU) evaluations using standardized prompts",
"id": "mmlu",
"releases": ["v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"]
"releases": ["v1.11.0", "v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"]
},
{
"title": "VHELM",
2 changes: 1 addition & 1 deletion mkdocs.yml
@@ -51,7 +51,7 @@ nav:
- get_helm_rank.md
- benchmark.md
- huggingface_models.md
- Multimodality:
- Papers:
- heim.md
- vhelm.md
- Reference:
93 changes: 93 additions & 0 deletions src/helm/benchmark/metrics/chain_of_thought_metric.py
@@ -0,0 +1,93 @@
import re
from typing import List, Optional

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


def extract_answer(output_text: str) -> Optional[str]:
"""
    Extracts the answer from the output text using two regex patterns.
Returns None if no valid answer is found.

Args:
output_text (str): The text from which to extract the answer.

Returns:
Optional[str]: The extracted answer (A-J) if found, otherwise None.
"""
# First regex: Matches "answer is (A-J)" with optional parentheses
match = re.search(r"answer is \(?([A-J])\)?", output_text)
if match:
return match.group(1)

    # Second regex: Matches "answer: (A-J)" or "Answer: (A-J)", with optional leading characters like "."
    match = re.search(r"\.*[aA]nswer:\s*\(?([A-J])\)?", output_text)
if match:
return match.group(1)

# If neither regex matches, return None
return None


class ChainOfThoughtMetric(Metric):
"""
This metric focuses on structured reasoning and the accuracy of extracted answers.
It compares model outputs against correct answers provided in a multiple-choice
format and returns a score indicating the correctness of the generated response.
"""

def evaluate_generation(
self,
adapter_spec: AdapterSpec,
request_state: RequestState,
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:
"""
Evaluate the generated output for chain-of-thought reasoning accuracy.

The method extracts the model's output, determines the correct answer
from the provided references, and compares the two to compute a binary score.

Args:
adapter_spec (AdapterSpec): Specification of the adapter used for the evaluation.
request_state (RequestState): The state of the current request, including
the input instance, output results, and references.
metric_service (MetricService): A service used to compute metrics if needed.
eval_cache_path (str): Path to the evaluation cache for storing or retrieving data.

Returns:
List[Stat]: A list containing a single `Stat` object with the correctness
score (1 for correct, 0 for incorrect) under the metric
name "chain_of_thought_correct".
"""
# Assert that completions exist if the result is not None
assert (
request_state.result is not None and request_state.result.completions
), "Request state result must have completions."

# Set output_text if the assertion passes
output_text = request_state.result.completions[0].text

# Extract the answer using the updated logic
extracted_answer = extract_answer(output_text)

# Find the correct answer from references by translating index to letter
correct_answer = None
for index, option in enumerate(request_state.instance.references):
if option.is_correct:
correct_answer = chr(65 + index) # Translate index (0 -> A, 1 -> B, etc.)
break

# Raise an exception if no correct answer is found
if correct_answer is None:
raise ValueError(f"No correct answer found for instance ID {request_state.instance.id}")

# Compare extracted answer with the correct answer and compute the score
score = 1 if extracted_answer == correct_answer else 0
return [Stat(MetricName("chain_of_thought_correctness")).add(score)]
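
A minimal sketch of how the answer-extraction helper behaves on a few invented completions, assuming the module above is importable from this branch (the sample strings are illustrative, not taken from any benchmark):

```python
# Illustrative check of extract_answer from the metric added in this PR.
from helm.benchmark.metrics.chain_of_thought_metric import extract_answer

samples = [
    "Let us think step by step... so the answer is (B).",  # caught by the "answer is" pattern
    "Reasoning omitted. Answer: C",                        # caught by the case-insensitive "answer:" pattern
    "None of the options seem plausible.",                 # no pattern matches
]

for text in samples:
    print(extract_answer(text))  # expected: "B", then "C", then None
```

The metric then maps the index of the correct reference to a letter via `chr(65 + index)` and reports 1 or 0 under `chain_of_thought_correctness`.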
4 changes: 4 additions & 0 deletions src/helm/benchmark/presentation/run_entries_speech.conf
@@ -6,6 +6,10 @@ entries: [
{description: "vocal_sound:model=audiolm", priority: 1}
{description: "audiocaps:model=audiolm", priority: 1}
{description: "voxceleb2:model=audiolm", priority: 1}
{description: "air_bench_chat:subject=speech,model=audiolm", priority: 1}
{description: "air_bench_chat:subject=sound,model=audiolm", priority: 1}
{description: "air_bench_chat:subject=music,model=audiolm", priority: 1}
{description: "air_bench_chat:subject=mix,model=audiolm", priority: 1}

####################################################################################################################
# Fairness
20 changes: 20 additions & 0 deletions src/helm/benchmark/run_specs/audio_run_specs.py
@@ -373,3 +373,23 @@ def get_casual_conversations2_run_spec(subject: str) -> RunSpec:
metric_specs=metric_specs,
groups=[run_spec_name],
)


@run_spec_function("air_bench_chat")
def get_air_bench_chat_run_spec(subject: str) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.audio_language.air_bench_chat_scenario." "AirBenchChatScenario",
args={"subject": subject},
)
adapter_spec = _get_generation_adapter_spec(
max_tokens=50,
)
metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs()
run_spec_name: str = "air_bench_chat"
return RunSpec(
name=f"{run_spec_name}:subject={subject}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=[run_spec_name],
)
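
As a rough sketch of how the new `air_bench_chat` entries in `run_entries_speech.conf` line up with this run spec function, the specs can be built directly and inspected (assuming the `helm` package from this branch is installed; the loop is purely illustrative):

```python
# Build the air_bench_chat run specs for each subject added to
# run_entries_speech.conf and inspect what they configure.
from helm.benchmark.run_specs.audio_run_specs import get_air_bench_chat_run_spec

for subject in ["speech", "sound", "music", "mix"]:
    spec = get_air_bench_chat_run_spec(subject)
    print(spec.name)                     # e.g. "air_bench_chat:subject=speech"
    print(spec.scenario_spec.args)       # {"subject": "speech"}, etc.
    print(spec.adapter_spec.max_tokens)  # 50, from _get_generation_adapter_spec above
```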
31 changes: 31 additions & 0 deletions src/helm/benchmark/run_specs/enem_challenge_specs.py
@@ -0,0 +1,31 @@
from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.scenarios.scenario import ScenarioSpec


@run_spec_function("enem_challenge")
def get_enem_spec() -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.enem_challenge_scenario.ENEMChallengeScenario", args={}
)

adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
instructions="Dê uma resposta selecionando uma letra entre as opções fornecidas. "
"Se as opções forem A, B, C, D e E, "
"sua resposta deve consistir em uma única letra que corresponde a resposta correta.\n"
"Exemplo: Qual é a capital da França?\nA. Londres\nB. Paris\nC. Roma\nD. Berlim\nE. Sydney\n"
"Resposta: B",
input_noun="Pergunta",
output_noun="Resposta",
)

return RunSpec(
name="enem_challenge",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(),
groups=["enem_challenge"],
)
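
The adapter instructions are in Portuguese; roughly translated: "Give an answer by selecting a letter from the options provided. If the options are A, B, C, D, and E, your answer must consist of a single letter corresponding to the correct answer. Example: What is the capital of France? A. London B. Paris C. Rome D. Berlin E. Sydney. Answer: B". A small sketch of building and inspecting the resulting run spec, assuming the `helm` package from this branch is installed:

```python
# Build the ENEM Challenge run spec and inspect the configured adapter.
from helm.benchmark.run_specs.enem_challenge_specs import get_enem_spec

spec = get_enem_spec()
print(spec.name)                       # "enem_challenge"
print(spec.adapter_spec.method)        # ADAPT_MULTIPLE_CHOICE_JOINT
print(spec.adapter_spec.input_prefix)  # derived from input_noun="Pergunta"
print(spec.groups)                     # ["enem_challenge"]
```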