stanford-crfm · yifanmai · Jan 22, 2025 · Jan 22, 2025 · Jan 22, 2025 · Jan 22, 2025
diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py
@@ -15,9 +15,6 @@
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 
 
-# Finance
-
-
 @run_spec_function("gold_commodity_news")
 def get_news_headline_spec(category: str) -> RunSpec:
     from helm.benchmark.scenarios.gold_commodity_news_scenario import GoldCommodityNewsScenario
@@ -44,7 +41,7 @@ def get_news_headline_spec(category: str) -> RunSpec:
 
 
 @run_spec_function("legal_contract_summarization")
-def get_legal_contract_spec() -> RunSpec:
+def get_legal_contract_summarization_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.legal_contract_summarization_scenario.LegalContractSummarizationScenario",
         args={},
@@ -67,6 +64,28 @@ def get_legal_contract_spec() -> RunSpec:
     )
 
 
+@run_spec_function("legal_opinion_sentiment_classification")
+def get_legal_opinion_sentiment_classification_spec() -> RunSpec:
+    """Build the RunSpec for 3-way sentiment classification of legal opinion sentences."""
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario.LegalOpinionSentimentClassificationScenario",  # noqa: E501
+        args={},  # no scenario arguments; passed explicitly like the sibling specs in this module
+    )
+
+    # The label names below are the same three strings the scenario uses as reference outputs.
+    instructions = "Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative."  # noqa: E501
+    adapter_spec = get_generation_adapter_spec(instructions=instructions, output_noun="Label")
+
+    return RunSpec(
+        name="legal_opinion_sentiment_classification",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        # TODO: Switch to using weighted F1
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["legal_opinion_sentiment_classification"],
+    )
+
+
 @run_spec_function("casehold")
 def get_casehold_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.casehold_scenario.CaseHOLDScenario", args={})

diff --git a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
@@ -0,0 +1,77 @@
+import os
+from typing import List
+
+import pandas as pd
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+
+
+class LegalOpinionSentimentClassificationScenario(Scenario):
+    """
+    A legal opinion sentiment classification task based on the paper
+    Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting
+    [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
+
+    Example prompt:
+    Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative.
+    {Sentence}
+    Label: {positive/neutral/negative}
+    """
+
+    name = "legal_opinion"
+    description = "Predicting the sentiment of the legal text in the positive, negative, or neutral."
+    tags = ["classification", "sentiment analysis", "legal"]
+
+    # Label text for each integer label found in the spreadsheets, indexed by that integer.
+    SENTIMENT_CLASSES = ["positive", "negative", "neutral"]
+    SPLIT_TO_URL = {
+        TRAIN_SPLIT: "https://osf.io/download/hfn62/",
+        TEST_SPLIT: "https://osf.io/download/q4adh/",
+    }
+    # The two spreadsheets name their columns differently: split -> (text column, label column).
+    SPLIT_TO_COLUMNS = {TRAIN_SPLIT: ("Phrase", "Label"), TEST_SPLIT: ("sentence", "label")}
+
+    def create_instances(self, df: pd.DataFrame, split: str) -> List[Instance]:
+        """Convert one split's spreadsheet rows into Instances with a single correct reference.
+
+        Args:
+            df: Rows of one split; must contain that split's text and label columns.
+            split: TRAIN_SPLIT or TEST_SPLIT; selects the column names and tags the instances.
+
+        Raises:
+            ValueError: If `split` is neither TRAIN_SPLIT nor TEST_SPLIT.
+        """
+        # Explicit check instead of `assert`, which is stripped when Python runs with -O.
+        if split not in self.SPLIT_TO_COLUMNS:
+            raise ValueError(f"Unsupported split: {split!r}")
+        phrase_column_name, label_column_name = self.SPLIT_TO_COLUMNS[split]
+        instances: List[Instance] = []
+        # NOTE(review): labels are presumably 0..2; a negative value would index from the end -- confirm.
+        for phrase, raw_label in zip(df[phrase_column_name], df[label_column_name]):
+            label = self.SENTIMENT_CLASSES[int(raw_label)]
+            references = [Reference(Output(text=label), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=Input(text=phrase), references=references, split=split))
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Download the train and test spreadsheets into `output_path`/data and parse both splits."""
+        self.data_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(self.data_dir)
+        instances: List[Instance] = []
+        for split, url in self.SPLIT_TO_URL.items():
+            file_path = os.path.join(self.data_dir, f"{split.lower()}.xlsx")
+            # Reuse file_path so the download target and the file that is read cannot drift apart.
+            ensure_file_downloaded(source_url=url, target_path=file_path)
+            df = pd.read_excel(file_path)
+            instances.extend(self.create_instances(df, split))
+        return instances
diff --git a/src/helm/benchmark/static/schema_enterprise.yaml b/src/helm/benchmark/static/schema_enterprise.yaml
@@ -116,6 +116,8 @@ run_groups:
     category: All scenarios
     subgroups:
       - legal_contract_summarization
+      - legal_opinion_sentiment_classification
+      - casehold
 
   - name: climate_scenarios
     display_name: Climate Scenarios
@@ -182,6 +184,22 @@ run_groups:
       when: before 2021
       language: English
 
+  - name: legal_opinion_sentiment_classification
+    display_name: Legal Opinion Sentiment Classification
+    description: A legal opinion sentiment classification task based on the paper Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: sentiment analysis
+      what: United States legal opinion texts
+      who: United States courts
+      when: Before 2020
+      language: English
+
   - name: sumosum
     display_name: SUMO Web Claims Summarization
     description: A summarization benchmark based on the climate subset of the SUMO dataset ([Mishra et al., 2020](https://aclanthology.org/2020.wnut-1.12/)).