Merge pull request #6 from athina-ai/dev
added example of eval suite
vivek-athina authored Feb 23, 2024
2 parents 9c3762c + 835451e commit 4bc9f55
Showing 2 changed files with 29 additions and 11 deletions.
37 changes: 28 additions & 9 deletions evaluations/run_athina_evals.py
@@ -1,18 +1,24 @@
import os
import pandas as pd
from athina.evals import (
    ContextContainsEnoughInformation,
    DoesResponseAnswerQuery,
    Faithfulness
)
from athina.loaders import RagLoader
from athina.runner.run import EvalRunner
from athina.keys import AthinaApiKey, OpenAiApiKey

from src.rag_application import RagApplication

from dotenv import load_dotenv
load_dotenv()

OpenAiApiKey.set_key(os.getenv('OPENAI_API_KEY'))
AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))

dataset = None

def load_data():
    app = RagApplication(openai_api_key=os.getenv('OPENAI_API_KEY'))
    # Create batch dataset from list of dict objects
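    # A hypothetical sketch of that shape, inferred from the commented
    # file-reading variant below (each record carries at least these keys;
    # the exact context format expected by RagLoader may differ):
    # raw_data = [{'query': '...', 'context': '...', 'response': '...'}]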
@@ -27,18 +33,12 @@ def load_data():

    # # or read from file (this variant would also need `import json`)
    # with open('evaluations/golden_dataset.jsonl', 'r') as file:
    #     raw_data = file.read().split('\n')
    # data = []
    # for item in raw_data:
    #     item = json.loads(item)
    #     item['context'], item['response'] = app.generate_response(item['query'])
    #     data.append(item)
    # global dataset
    # dataset = RagLoader().load_dict(data)
    # pd.DataFrame(dataset)

    global dataset
    dataset = RagLoader().load_dict(raw_data)
@@ -52,6 +52,7 @@ def evaluate_and_validate():
    eval_model = "gpt-3.5-turbo"
    df = DoesResponseAnswerQuery(model=eval_model).run_batch(data=dataset).to_df()
    # Validation: Check if all rows in the dataframe passed the evaluation
    df['passed'] = df['passed'].astype(bool)
    all_passed = df['passed'].all()
    if not all_passed:
        failed_responses = df[~df['passed']]
@@ -64,6 +65,7 @@ def evaluate_and_validate():
    # Validate whether the response is faithful to the context
    df = Faithfulness(model=eval_model).run_batch(data=dataset).to_df()
    # Validation: Check if all rows in the dataframe passed the evaluation
    df['passed'] = df['passed'].astype(bool)
    all_passed = df['passed'].all()
    if not all_passed:
        failed_responses = df[~df['passed']]
@@ -73,6 +75,23 @@ else:
    else:
        print("All responses passed the evaluation.")

    # # Run an entire suite of Evaluators as well
    # eval_suite = [
    #     DoesResponseAnswerQuery(model=eval_model),
    #     Faithfulness(model=eval_model),
    #     ContextContainsEnoughInformation(model=eval_model),
    # ]

    # # Run the evaluation suite
    # batch_eval_result = EvalRunner.run_suite(
    #     evals=eval_suite,
    #     data=dataset,
    #     max_parallel_evals=2
    # )

    # # Validate the batch_eval_results as you want.
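    # # For example (a hypothetical sketch, not a documented API: it assumes
    # # the suite result can be flattened to a dataframe with a boolean
    # # `passed` column, mirroring the single-eval pattern above):
    # suite_df = batch_eval_result.to_df()
    # assert suite_df['passed'].astype(bool).all(), "Some suite evals failed"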


if __name__ == "__main__":
    load_data()
    evaluate_and_validate()
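
To try the script locally, a plausible invocation (assuming a .env file at the repository root that provides OPENAI_API_KEY and ATHINA_API_KEY, and running from the root so that src/ and evaluations/ resolve as packages):

    python -m evaluations.run_athina_evals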
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
llama_index==0.9.40
openai==1.12.0
python-dotenv==1.0.1
