Merge pull request #11 from Clarifai/add-ragas
[DEVX-385] - Add ragas
isaac-chung authored May 15, 2024
2 parents a6afae9 + 59a1c6f commit 7450241
Showing 9 changed files with 216 additions and 34 deletions.
2 changes: 2 additions & 0 deletions clarifai_model_utils/llm_eval/constant.py
@@ -5,8 +5,10 @@

BGE_BASE_EMBED_MODEL = "https://clarifai.com/clarifai/main/models/BAAI-bge-base-en-v15"


@dataclass
class JUDGE_LLMS:
GPT3_5_TURBO = "https://clarifai.com/openai/chat-completion/models/GPT-3_5-turbo"
LLAMA2_CHAT_70B = "https://clarifai.com/meta/Llama-2/models/llama2-70b-chat"
GPT4 = "https://clarifai.com/openai/chat-completion/models/GPT-4"
DBRX_INSTRUCT = "https://clarifai.com/databricks/drbx/models/dbrx-instruct"
2 changes: 2 additions & 2 deletions clarifai_model_utils/llm_eval/evaluator/harness_eval/README.md
@@ -27,9 +27,9 @@ where:

#### 1.2. Dataset

Since the initial goal is to evaluate model and dataset uploaded in the platform. The harness-eval consumes hf dataset to run, so we first download Clarifai dataset to temporary file and assign it to harness-eval the config (Task) see [here](https://github.com/Clarifai/clarifai-model-utils/blob/main/clarifai_model_utils/llm_eval/evaluator/harness_eval/evaluate.py)
The initial goal is to evaluate a model and a dataset uploaded to the platform. Since harness-eval consumes a Hugging Face dataset, we first download the Clarifai dataset to a temporary file and assign it to the harness-eval config (Task); see [here](https://github.com/Clarifai/clarifai-model-utils/blob/main/clarifai_model_utils/llm_eval/evaluator/harness_eval/evaluate.py)

For now, it also supports huggingface dataset. See this [#6 PR](https://github.com/Clarifai/clarifai-model-utils/pull/6)
It also supports Hugging Face datasets. See [PR #6](https://github.com/Clarifai/clarifai-model-utils/pull/6)
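
For illustration, here is a minimal sketch of that temporary-file hand-off (the dataframe below is a stand-in for the downloaded Clarifai dataset; the config keys mirror what `evaluate.py` sets):

```python
import tempfile

import pandas as pd

# Stand-in for the dataset downloaded from Clarifai.
df = pd.DataFrame([{"question": "What is Clarifai?", "ground_truth": "An AI platform."}])

# Write it to a temporary CSV and point the harness-eval task config at it.
tmp = tempfile.NamedTemporaryFile(suffix=".csv", delete=False)
df.to_csv(tmp.name, index=False)

config = {
    "dataset_path": "csv",
    "dataset_kwargs": {"data_files": {"validation": tmp.name}},
}
```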

### 2. [llm.py](llm.py)

66 changes: 42 additions & 24 deletions clarifai_model_utils/llm_eval/evaluator/harness_eval/evaluate.py
@@ -14,13 +14,13 @@
from clarifai.client.model import Model
from clarifai.client.workflow import Workflow
from clarifai.utils.logging import get_logger

try:
from .llm import ClarifaiLM # noqa # pylint: disable=unused-import
except ImportError:
pass

from .output import DatasetInfo, EvaluateResult, LmJudgeInfo, PromptTemplate, make_result_dataframe
from .python_templates import PYTHON_TEMPLATES

logger = get_logger(name=__file__)

@@ -154,19 +154,23 @@ def call_harness_eval(self,

return results

def prepare_config(self, template, data_file: str = None):
def prepare_config(self, template, data_file: str = None, **kwargs):
"""Verify if 'template' is in defined templates or dict
Args:
template (Union[str, dict])
data_file (str, optional): path to dataframe if using dataset_path in config. Defaults to None.
kwargs: keyword args to load template
Returns:
dict
"""
config = {}
if template in self.templates:
config = deepcopy(self.template_configs[template]["config"])
elif template in PYTHON_TEMPLATES:
config = PYTHON_TEMPLATES[template](**kwargs)
config = config.to_harness_dict_config()
elif type(template) == TaskConfig:
config = template.to_dict(keep_callable=True)
elif isinstance(template, dict):
@@ -185,25 +189,24 @@ def prepare_config(self, template, data_file: str = None):
config['dataset_kwargs'] = dict(data_files=dict(validation=data_file))
return config

def evaluate(
self,
predictor: Union[Model, Workflow],
data_frame: pd.DataFrame,
template: str,
weights: dict = {},
regex_code: str = "",
input_prompt: str = "",
judge_llm_url: str = "",
custom_config: dict = {
"num_fewshot": 0,
},
inference_parameters: dict = {},
predictor_kwargs: dict = {},
eval_id: str = None,
dataset_info: dict = None,
workflow_output_node: int = 1,
is_rag_workflow: bool = None,
) -> EvaluateResult:
def evaluate(self,
predictor: Union[Model, Workflow],
data_frame: pd.DataFrame,
template: str,
weights: dict = {},
regex_code: str = "",
input_prompt: str = "",
judge_llm_url: str = "",
custom_config: dict = {
"num_fewshot": 0,
},
inference_parameters: dict = {},
predictor_kwargs: dict = {},
eval_id: str = None,
dataset_info: dict = None,
workflow_output_node: int = 1,
is_rag_workflow: bool = None,
**kwargs) -> EvaluateResult:
"""Evaluate
Args:
predictor (Union[Model, Workflow]): Model/Workflow or Url
@@ -232,7 +235,10 @@ def evaluate(

_template = deepcopy(template)

config = self.prepare_config(_template, _file.name)
config = self.prepare_config(
_template,
_file.name,
)
config.update(custom_config)
template_name = config.get("task", None) or template
# checking weights config
@@ -248,15 +254,27 @@
config.update(filters)

judge_model = None
if template_name in ['llm_as_judge', 'rag']:
if template_name in ['llm_as_judge', 'rag', 'ragas']:
assert judge_llm_url, ValueError(
"Please provide judge_llm_url for the llm_as_judge, rag, or ragas templates")
judge_model = LmJudgeInfo(
url=judge_llm_url, pat=predictor.auth_helper._pat, token=predictor.auth_helper._token)
logger.debug(judge_model)
if 'rag' in template_name or is_rag_workflow:
assert isinstance(predictor, Workflow), "Require Workflow predictor to evaluate RAG"
config.update(dict(process_results=judge_model.judge.process_rag_result))
if template_name == 'rag':
config.update(dict(process_results=judge_model.judge.process_rag_result))
elif template_name == 'ragas':
_ragas_config = PYTHON_TEMPLATES[template_name](
langchain_llm_kwargs=dict(
model_url=judge_llm_url,
pat=predictor.auth_helper._pat,
token=predictor.auth_helper._token),
has_ground_truth="ground_truth" in data_frame.columns)
config.update(
dict(
process_results=_ragas_config.process_results_func,
metric_list=_ragas_config.config.metric_list))
is_rag_workflow = True
elif template_name == 'llm_as_judge':
config.update(dict(process_results=judge_model.judge_process_result_func))
5 changes: 5 additions & 0 deletions clarifai_model_utils/llm_eval/evaluator/harness_eval/python_templates/__init__.py
@@ -0,0 +1,5 @@
from .ragas_eval import RAGAS

PYTHON_TEMPLATES = {"ragas": RAGAS}

ALL_PYTHON_TEMPLATES = list(PYTHON_TEMPLATES.keys())
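
As a rough sketch of how this registry is consumed (mirroring the new branch in `prepare_config` in `evaluate.py`; the full import path and the `has_ground_truth` kwarg are assumptions for illustration):

```python
from clarifai_model_utils.llm_eval.evaluator.harness_eval.python_templates import PYTHON_TEMPLATES

template = "ragas"
if template in PYTHON_TEMPLATES:
    # Instantiate the python template, then flatten it into a harness-eval dict config.
    config = PYTHON_TEMPLATES[template](has_ground_truth=False).to_harness_dict_config()
```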
8 changes: 8 additions & 0 deletions clarifai_model_utils/llm_eval/evaluator/harness_eval/python_templates/base.py
@@ -0,0 +1,8 @@
from abc import ABC, abstractmethod


class _BasePythonTemplate(ABC):

@abstractmethod
def to_harness_dict_config(self) -> dict:
"""convert current config to Harness Eval TaskConfig Dictionary"""
127 changes: 127 additions & 0 deletions clarifai_model_utils/llm_eval/evaluator/harness_eval/python_templates/ragas_eval.py
@@ -0,0 +1,127 @@
import logging
import math
from dataclasses import asdict, dataclass, field
from typing import Dict, List, Optional

from datasets import Dataset as HFDataset
from langchain_community.embeddings import ClarifaiEmbeddings
from langchain_community.llms import Clarifai
from lm_eval.api.task import TaskConfig
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness

from ....constant import BGE_BASE_EMBED_MODEL
from .base import _BasePythonTemplate

logger = logging.getLogger("ragas")
logger.disabled = True


@dataclass
class RAGAS(_BasePythonTemplate):

langchain_llm_kwargs: dict = field(default_factory=lambda: {})
langchain_llm: Optional[Clarifai] = None
has_ground_truth: Optional[bool] = True
embedder: Optional[ClarifaiEmbeddings] = None
config: TaskConfig = field(default_factory=TaskConfig)

def __post_init__(self) -> None:

self.config.task = "ragas"
self.config.group = "ragas"
self.config.dataset_path = "csv"
self.config.dataset_name = None
self.config.output_type = "generate_until"
self.config.validation_split = "validation"
self.config.doc_to_text = "{{question}}"
self.config.doc_to_target = ""
self.config.repeats = 1
self.config.num_fewshot = 0

self.config.metric_list = [{
"metric": "faithfulness",
"aggregation": "mean",
"higher_is_better": True,
}, {
"metric": "answer_relevancy",
"aggregation": "mean",
"higher_is_better": True,
}]

self.ragas_metrics = [
faithfulness,
answer_relevancy,
]

if self.has_ground_truth:
self.config.metric_list.extend([{
"metric": "context_precision",
"aggregation": "mean",
"higher_is_better": True,
}, {
"metric": "context_recall",
"aggregation": "mean",
"higher_is_better": True,
}])
self.ragas_metrics.extend([context_precision, context_recall])

self.config.process_results = self.process_results_func

def process_results_func(self, doc: dict, results: List[List]) -> Dict[str, float]:
"""Compute RAGAS metrics per row of dataset
Args:
doc (dict): A dictionary representing a row of dataset data. It must include `question` and optionally `ground_truth` if `has_ground_truth` is set.
results (List[List]): a result list whose length equals the batch size (1); each element is the [context, answer] pair from the RAG workflow
Example:
>>> doc = dict(question="What is Clarifai?")
>>> rag_results = [["Context: Long document about Clarifai...", "Clarifai is the leading Full Stack AI, LLM, and computer vision production platform..."]]
>>> scores = ragas_eval_instance.process_results_func(doc, rag_results)
>>> print(scores)
>>> {'faithfulness': 0.999, 'answer_relevancy': 0.99}
Returns:
Dict[str, float]: ragas scores
"""
assert isinstance(results,
list) and len(results[0]) > 1, "results must be a list of [context, answer]"

pat = self.langchain_llm_kwargs.get("pat", None)
token = self.langchain_llm_kwargs.get("token", None)
if self.embedder is None:
self.embedder = ClarifaiEmbeddings(model_url=BGE_BASE_EMBED_MODEL, pat=pat, token=token)
self.langchain_llm = Clarifai(**self.langchain_llm_kwargs)
# context from Clarifai RAG workflow
# NOTE: discard the context stored in the dataset; only use the context returned by the RAG workflow
context = results[0][0]
# answer from Clarifai RAG workflow
answer = results[0][1]
# Take value of question
question = doc["question"]
# Fall back to an empty string when the dataset has no ground_truth column.
ground_truth = doc.get("ground_truth", "")
data = {
"question": [question],
"answer": [answer],
"contexts": [[context]],
"ground_truth": [ground_truth]
}
dataset = HFDataset.from_dict(data)

ragas_results = evaluate(
dataset=dataset,
llm=LangchainLLMWrapper(self.langchain_llm),
embeddings=self.embedder,
metrics=self.ragas_metrics)

# FIXME: replace NaN values with 0.
return {k: 0. if math.isnan(v) else v for k, v in ragas_results.items()}

def to_harness_dict_config(self) -> dict:
d = asdict(self.config)
return d
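
A minimal sketch of exercising the template directly on a single row (credentials and the judge model URL are placeholders; in the library this wiring is handled by `evaluate.py`):

```python
from clarifai_model_utils.llm_eval.evaluator.harness_eval.python_templates import RAGAS

# Judge LLM URL and PAT are placeholders.
ragas_template = RAGAS(
    langchain_llm_kwargs=dict(
        model_url="https://clarifai.com/openai/chat-completion/models/GPT-4",
        pat="YOUR_CLARIFAI_PAT"),
    has_ground_truth=False)

# Score one row: `results` is the [[context, answer]] pair produced by the RAG workflow.
doc = {"question": "What is Clarifai?"}
results = [["<retrieved context about Clarifai>", "<generated answer>"]]
scores = ragas_template.process_results_func(doc, results)
print(scores)  # e.g. {'faithfulness': ..., 'answer_relevancy': ...}
```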
5 changes: 1 addition & 4 deletions clarifai_model_utils/llm_eval/main.py
@@ -318,11 +318,8 @@ def evaluate(self,
generate_qa=generate_qa)
elif isinstance(dataset, pd.DataFrame): # local data
df = dataset
assert dataset_id or app_id, ValueError(
f"`dataset_id` or `app_id` is empty when using local dataset. Please pass them to kwargs"
)
elif isinstance(dataset, HFDataset):
df = dataset.to_pandas(batched=True)
df = dataset.to_pandas(batched=False)
elif template in HARNESS_EVAL_TASK_MANAGER.all_tasks:
df = None
dataset_id = template
29 changes: 29 additions & 0 deletions examples/ragas_example.py
@@ -0,0 +1,29 @@
import os

import pandas as pd

from clarifai.client import Workflow
from clarifai_model_utils import ClarifaiEvaluator
from clarifai_model_utils.llm_eval.constant import JUDGE_LLMS

# set PAT
os.environ["CLARIFAI_PAT"] = ""

# Load Clarifai RAG workflow
wf = Workflow(url= ...)

evaluator = ClarifaiEvaluator(predictor=wf, is_rag_workflow=True)

# Create a dummy dataset
df = [dict(question="What is WC 2022"), dict(question="Who won the title?")]
df = pd.DataFrame(df)

# Run evaluate
out = evaluator.evaluate(
template="ragas",
upload=False,
judge_llm_url=JUDGE_LLMS.DBRX_INSTRUCT, # use databricks/DBRX-Instruct in RAGAS
dataset=df,
)

print(out.df_to_pandas())
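
Note: the dummy dataset above has no `ground_truth` column, so `has_ground_truth` resolves to `False` and only `faithfulness` and `answer_relevancy` are reported; add a `ground_truth` column to also compute `context_precision` and `context_recall`.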
4 changes: 0 additions & 4 deletions requirements.txt
@@ -10,7 +10,3 @@ requests==2.31.0
tqdm==4.66.1
sacrebleu==2.4.0
datasets==2.17.1
langchain==0.1.6
langchain-community==0.0.20
langchain-core==0.1.45
langsmith==0.0.87
