Merge pull request #11 from Clarifai/add-ragas
[DEVX-385] - Add ragas
isaac-chung authored May 15, 2024
2 parents a6afae9 + 59a1c6f commit 7450241
Showing 9 changed files with 216 additions and 34 deletions.
2 changes: 2 additions & 0 deletions clarifai_model_utils/llm_eval/constant.py
@@ -5,8 +5,10 @@

BGE_BASE_EMBED_MODEL = "https://clarifai.com/clarifai/main/models/BAAI-bge-base-en-v15"


@dataclass
class JUDGE_LLMS:
GPT3_5_TURBO = "https://clarifai.com/openai/chat-completion/models/GPT-3_5-turbo"
LLAMA2_CHAT_70B = "https://clarifai.com/meta/Llama-2/models/llama2-70b-chat"
GPT4 = "https://clarifai.com/openai/chat-completion/models/GPT-4"
DBRX_INSTRUCT = "https://clarifai.com/databricks/drbx/models/dbrx-instruct"
2 changes: 2 additions & 2 deletions clarifai_model_utils/llm_eval/evaluator/harness_eval/README.md
@@ -27,9 +27,9 @@ where:

#### 1.2. Dataset

Since the initial goal is to evaluate model and dataset uploaded in the platform. The harness-eval consumes hf dataset to run, so we first download Clarifai dataset to temporary file and assign it to harness-eval the config (Task) see [here](https://github.com/Clarifai/clarifai-model-utils/blob/main/clarifai_model_utils/llm_eval/evaluator/harness_eval/evaluate.py)
The initial goal is to evaluate a model and a dataset uploaded to the platform. Since harness-eval consumes a Hugging Face dataset, we first download the Clarifai dataset to a temporary file and assign it to the harness-eval config (Task); see [here](https://github.com/Clarifai/clarifai-model-utils/blob/main/clarifai_model_utils/llm_eval/evaluator/harness_eval/evaluate.py)

For now, it also supports huggingface dataset. See this [#6 PR](https://github.com/Clarifai/clarifai-model-utils/pull/6)
It also supports Hugging Face datasets. See [PR #6](https://github.com/Clarifai/clarifai-model-utils/pull/6)
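
For illustration, here is a minimal sketch of that temporary-file hand-off (the dataframe below is a stand-in for the downloaded Clarifai dataset; the config keys mirror what `evaluate.py` sets):

```python
import tempfile

import pandas as pd

# Stand-in for the dataset downloaded from Clarifai.
df = pd.DataFrame([{"question": "What is Clarifai?", "ground_truth": "An AI platform."}])

# Write it to a temporary CSV and point the harness-eval task config at it.
tmp = tempfile.NamedTemporaryFile(suffix=".csv", delete=False)
df.to_csv(tmp.name, index=False)

config = {
    "dataset_path": "csv",
    "dataset_kwargs": {"data_files": {"validation": tmp.name}},
}
```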

### 2. [llm.py](llm.py)

66 changes: 42 additions & 24 deletions clarifai_model_utils/llm_eval/evaluator/harness_eval/evaluate.py
@@ -14,13 +14,13 @@
from clarifai.client.model import Model
from clarifai.client.workflow import Workflow
from clarifai.utils.logging import get_logger

try:
from .llm import ClarifaiLM # noqa # pylint: disable=unused-import
except ImportError:
pass

from .output import DatasetInfo, EvaluateResult, LmJudgeInfo, PromptTemplate, make_result_dataframe
from .python_templates import PYTHON_TEMPLATES

logger = get_logger(name=__file__)

@@ -154,19 +154,23 @@ def call_harness_eval(self,

return results

def prepare_config(self, template, data_file: str = None):
def prepare_config(self, template, data_file: str = None, **kwargs):
"""Verify if 'template' is in defined templates or dict
Args:
template (Union[str, dict])
data_file (str, optional): path to dataframe if using dataset_path in config. Defaults to None.
kwargs: keyword args to load template
Returns:
dict
"""
config = {}
if template in self.templates:
config = deepcopy(self.template_configs[template]["config"])
elif template in PYTHON_TEMPLATES:
config = PYTHON_TEMPLATES[template](**kwargs)
config = config.to_harness_dict_config()
elif type(template) == TaskConfig:
config = template.to_dict(keep_callable=True)
elif isinstance(template, dict):
@@ -185,25 +189,24 @@ def prepare_config(self, template, data_file: str = None):
config['dataset_kwargs'] = dict(data_files=dict(validation=data_file))
return config

def evaluate(
self,
predictor: Union[Model, Workflow],
data_frame: pd.DataFrame,
template: str,
weights: dict = {},
regex_code: str = "",
input_prompt: str = "",
judge_llm_url: str = "",
custom_config: dict = {
"num_fewshot": 0,
},
inference_parameters: dict = {},
predictor_kwargs: dict = {},
eval_id: str = None,
dataset_info: dict = None,
workflow_output_node: int = 1,
is_rag_workflow: bool = None,
) -> EvaluateResult:
def evaluate(self,
predictor: Union[Model, Workflow],
data_frame: pd.DataFrame,
template: str,
weights: dict = {},
regex_code: str = "",
input_prompt: str = "",
judge_llm_url: str = "",
custom_config: dict = {
"num_fewshot": 0,
},
inference_parameters: dict = {},
predictor_kwargs: dict = {},
eval_id: str = None,
dataset_info: dict = None,
workflow_output_node: int = 1,
is_rag_workflow: bool = None,
**kwargs) -> EvaluateResult:
"""Evaluate
Args:
predictor (Union[Model, Workflow]): Model/Workflow or Url
@@ -232,7 +235,10 @@ def evaluate(

_template = deepcopy(template)

config = self.prepare_config(_template, _file.name)
config = self.prepare_config(
_template,
_file.name,
)
config.update(custom_config)
template_name = config.get("task", None) or template
# checking weights config
@@ -248,15 +254,27 @@
config.update(filters)

judge_model = None
if template_name in ['llm_as_judge', 'rag']:
if template_name in ['llm_as_judge', 'rag', 'ragas']:
assert judge_llm_url, ValueError(
"Please provide judge_llm_url for the llm_as_judge, rag, or ragas templates")
judge_model = LmJudgeInfo(
url=judge_llm_url, pat=predictor.auth_helper._pat, token=predictor.auth_helper._token)
logger.debug(judge_model)
if 'rag' in template_name or is_rag_workflow:
assert isinstance(predictor, Workflow), "Require Workflow predictor to evaluate RAG"
config.update(dict(process_results=judge_model.judge.process_rag_result))
if template_name == 'rag':
config.update(dict(process_results=judge_model.judge.process_rag_result))
elif template_name == 'ragas':
_ragas_config = PYTHON_TEMPLATES[template_name](
langchain_llm_kwargs=dict(
model_url=judge_llm_url,
pat=predictor.auth_helper._pat,
token=predictor.auth_helper._token),
has_ground_truth="ground_truth" in data_frame.columns)
config.update(
dict(
process_results=_ragas_config.process_results_func,
metric_list=_ragas_config.config.metric_list))
is_rag_workflow = True
elif template_name == 'llm_as_judge':
config.update(dict(process_results=judge_model.judge_process_result_func))
5 changes: 5 additions & 0 deletions clarifai_model_utils/llm_eval/evaluator/harness_eval/python_templates/__init__.py
@@ -0,0 +1,5 @@
from .ragas_eval import RAGAS

PYTHON_TEMPLATES = {"ragas": RAGAS}

ALL_PYTHON_TEMPLATES = list(PYTHON_TEMPLATES.keys())
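
As a rough sketch of how this registry is consumed (mirroring the new branch in `prepare_config` in `evaluate.py`; the full import path and the `has_ground_truth` kwarg are assumptions for illustration):

```python
from clarifai_model_utils.llm_eval.evaluator.harness_eval.python_templates import PYTHON_TEMPLATES

template = "ragas"
if template in PYTHON_TEMPLATES:
    # Instantiate the python template, then flatten it into a harness-eval dict config.
    config = PYTHON_TEMPLATES[template](has_ground_truth=False).to_harness_dict_config()
```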
8 changes: 8 additions & 0 deletions clarifai_model_utils/llm_eval/evaluator/harness_eval/python_templates/base.py
@@ -0,0 +1,8 @@
from abc import ABC, abstractmethod


class _BasePythonTemplate(ABC):

@abstractmethod
def to_harness_dict_config(self) -> dict:
"""convert current config to Harness Eval TaskConfig Dictionary"""
127 changes: 127 additions & 0 deletions clarifai_model_utils/llm_eval/evaluator/harness_eval/python_templates/ragas_eval.py
@@ -0,0 +1,127 @@
import logging
import math
from dataclasses import asdict, dataclass, field
from typing import Dict, List, Optional

from datasets import Dataset as HFDataset
from langchain_community.embeddings import ClarifaiEmbeddings
from langchain_community.llms import Clarifai
from lm_eval.api.task import TaskConfig
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness

from ....constant import BGE_BASE_EMBED_MODEL
from .base import _BasePythonTemplate

logger = logging.getLogger("ragas")
logger.disabled = True


@dataclass
class RAGAS(_BasePythonTemplate):

langchain_llm_kwargs: dict = field(default_factory=lambda: {})
langchain_llm: Optional[Clarifai] = None
has_ground_truth: Optional[bool] = True
embedder: Optional[ClarifaiEmbeddings] = None
config: TaskConfig = field(default_factory=TaskConfig)

def __post_init__(self) -> None:

self.config.task = "ragas"
self.config.group = "ragas"
self.config.dataset_path = "csv"
self.config.dataset_name = None
self.config.output_type = "generate_until"
self.config.validation_split = "validation"
self.config.doc_to_text = "{{question}}"
self.config.doc_to_target = ""
self.config.repeats = 1
self.config.num_fewshot = 0

self.config.metric_list = [{
"metric": "faithfulness",
"aggregation": "mean",
"higher_is_better": True,
}, {
"metric": "answer_relevancy",
"aggregation": "mean",
"higher_is_better": True,
}]

self.ragas_metrics = [
faithfulness,
answer_relevancy,
]

if self.has_ground_truth:
self.config.metric_list.extend([{
"metric": "context_precision",
"aggregation": "mean",
"higher_is_better": True,
}, {
"metric": "context_recall",
"aggregation": "mean",
"higher_is_better": True,
}])
self.ragas_metrics.extend([context_precision, context_recall])

self.config.process_results = self.process_results_func

def process_results_func(self, doc: dict, results: List[List]) -> Dict[str, float]:
"""Compute RAGAS metrics per row of dataset
Args:
doc (dict): A dictionary representing a row of dataset data. It must include `question` and optionally `ground_truth` if `has_ground_truth` is set.
results (List[List]): a result list whose length equals the batch size (1); each element is the [context, answer] pair from the RAG workflow
Example:
>>> doc = dict(question="What is Clarifai?")
>>> rag_results = [["Context: Long document about Clarifai...", "Clarifai is the leading Full Stack AI, LLM, and computer vision production platform..."]]
>>> scores = ragas_eval_instance.process_results_func(doc, rag_results)
>>> print(scores)
>>> {'faithfulness': 0.999, 'answer_relevancy': 0.99}
Returns:
Dict[str, float]: ragas scores
"""
assert isinstance(results,
list) and len(results[0]) > 1, "results must be a list of [context, answer]"

pat = self.langchain_llm_kwargs.get("pat", None)
token = self.langchain_llm_kwargs.get("token", None)
if self.embedder is None:
self.embedder = ClarifaiEmbeddings(model_url=BGE_BASE_EMBED_MODEL, pat=pat, token=token)
self.langchain_llm = Clarifai(**self.langchain_llm_kwargs)
# context from Clarifai RAG workflow
# NOTE: discard the context stored in the dataset; only use the context returned by the RAG workflow
context = results[0][0]
# answer from Clarifai RAG workflow
answer = results[0][1]
# Take value of question
question = doc["question"]
# Fall back to an empty string when the dataset has no ground_truth column.
ground_truth = doc.get("ground_truth", "")
data = {
"question": [question],
"answer": [answer],
"contexts": [[context]],
"ground_truth": [ground_truth]
}
dataset = HFDataset.from_dict(data)

ragas_results = evaluate(
dataset=dataset,
llm=LangchainLLMWrapper(self.langchain_llm),
embeddings=self.embedder,
metrics=self.ragas_metrics)

# FIXME: replace NaN values with 0.
return {k: 0. if math.isnan(v) else v for k, v in ragas_results.items()}

def to_harness_dict_config(self) -> dict:
d = asdict(self.config)
return d
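
A minimal sketch of exercising the template directly on a single row (credentials and the judge model URL are placeholders; in the library this wiring is handled by `evaluate.py`):

```python
from clarifai_model_utils.llm_eval.evaluator.harness_eval.python_templates import RAGAS

# Judge LLM URL and PAT are placeholders.
ragas_template = RAGAS(
    langchain_llm_kwargs=dict(
        model_url="https://clarifai.com/openai/chat-completion/models/GPT-4",
        pat="YOUR_CLARIFAI_PAT"),
    has_ground_truth=False)

# Score one row: `results` is the [[context, answer]] pair produced by the RAG workflow.
doc = {"question": "What is Clarifai?"}
results = [["<retrieved context about Clarifai>", "<generated answer>"]]
scores = ragas_template.process_results_func(doc, results)
print(scores)  # e.g. {'faithfulness': ..., 'answer_relevancy': ...}
```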
5 changes: 1 addition & 4 deletions clarifai_model_utils/llm_eval/main.py
@@ -318,11 +318,8 @@ def evaluate(self,
generate_qa=generate_qa)
elif isinstance(dataset, pd.DataFrame): # local data
df = dataset
assert dataset_id or app_id, ValueError(
f"`dataset_id` or `app_id` is empty when using local dataset. Please pass them to kwargs"
)
elif isinstance(dataset, HFDataset):
df = dataset.to_pandas(batched=True)
df = dataset.to_pandas(batched=False)
elif template in HARNESS_EVAL_TASK_MANAGER.all_tasks:
df = None
dataset_id = template
29 changes: 29 additions & 0 deletions examples/ragas_example.py
@@ -0,0 +1,29 @@
import os

import pandas as pd

from clarifai.client import Workflow
from clarifai_model_utils import ClarifaiEvaluator
from clarifai_model_utils.llm_eval.constant import JUDGE_LLMS

# set PAT
os.environ["CLARIFAI_PAT"] = ""

# Load Clarifai RAG workflow
wf = Workflow(url= ...)

evaluator = ClarifaiEvaluator(predictor=wf, is_rag_workflow=True)

# Create a dummy dataset
df = [dict(question="What is WC 2022"), dict(question="Who won the title?")]
df = pd.DataFrame(df)

# Run evaluate
out = evaluator.evaluate(
template="ragas",
upload=False,
judge_llm_url=JUDGE_LLMS.DBRX_INSTRUCT, # use databricks/DBRX-Instruct in RAGAS
dataset=df,
)

print(out.df_to_pandas())
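
Note: the dummy dataset above has no `ground_truth` column, so `has_ground_truth` resolves to `False` and only `faithfulness` and `answer_relevancy` are reported; add a `ground_truth` column to also compute `context_precision` and `context_recall`.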
4 changes: 0 additions & 4 deletions requirements.txt
@@ -10,7 +10,3 @@ requests==2.31.0
tqdm==4.66.1
sacrebleu==2.4.0
datasets==2.17.1
langchain==0.1.6
langchain-community==0.0.20
langchain-core==0.1.45
langsmith==0.0.87
