diff --git a/evaluations/run_athina_evals.py b/evaluations/run_athina_evals.py
index 62f30a9..72459c1 100644
--- a/evaluations/run_athina_evals.py
+++ b/evaluations/run_athina_evals.py
@@ -1,18 +1,24 @@
 import os
 import pandas as pd
 from athina.evals import (
+    ContextContainsEnoughInformation,
     DoesResponseAnswerQuery,
     Faithfulness
 )
 from athina.loaders import RagLoader
+from athina.runner.run import EvalRunner
 from athina.keys import AthinaApiKey, OpenAiApiKey
+
 from src.rag_application import RagApplication
 
+dataset = None
+
+from dotenv import load_dotenv
+load_dotenv()
+
 OpenAiApiKey.set_key(os.getenv('OPENAI_API_KEY'))
 AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))
 
-dataset = None
-
 def load_data():
     app = RagApplication(openai_api_key=os.getenv('OPENAI_API_KEY'))
     # Create batch dataset from list of dict objects
@@ -27,18 +33,12 @@ def load_data():
 
     # # or read from file
     # with open('evaluations/golden_dataset.jsonl', 'r') as file:
-    #     raw_data = file.read()
-    #     raw_data = raw_data.split('\n')
+    #     raw_data = file.read().split('\n')
     #     data = []
     #     for item in raw_data:
     #         item = json.loads(item)
     #         item['context'], item['response'] = app.generate_response(item['query'])
     #         data.append(item)
-    # global dataset
-    # dataset = RagLoader().load_dict(data)
-    # pd.DataFrame(dataset)
-    # for item in raw_data:
-    #     item['context'], item['response'] = app.generate_response(item['query'])
 
     global dataset
     dataset = RagLoader().load_dict(raw_data)
@@ -52,6 +52,7 @@ def evaluate_and_validate():
     eval_model = "gpt-3.5-turbo"
     df = DoesResponseAnswerQuery(model=eval_model).run_batch(data=dataset).to_df()
     # Validation: Check if all rows in the dataframe passed the evaluation
+    df['passed'] = df['passed'].astype(bool)
     all_passed = df['passed'].all()
     if not all_passed:
         failed_responses = df[~df['passed']]
@@ -64,6 +65,7 @@ def evaluate_and_validate():
     # Validate whether the response is faithful to the context
     df = Faithfulness(model=eval_model).run_batch(data=dataset).to_df()
     # Validation: Check if all rows in the dataframe passed the evaluation
+    df['passed'] = df['passed'].astype(bool)
     all_passed = df['passed'].all()
     if not all_passed:
         failed_responses = df[~df['passed']]
@@ -73,6 +75,23 @@ def evaluate_and_validate():
     else:
         print("All responses passed the evaluation.")
 
+    # # Run an entire suite of Evaluators as well
+    # eval_suite = [
+    #     DoesResponseAnswerQuery(model=eval_model),
+    #     Faithfulness(model=eval_model),
+    #     ContextContainsEnoughInformation(model=eval_model),
+    # ]
+
+    # # Run the evaluation suite
+    # batch_eval_result = EvalRunner.run_suite(
+    #     evals=eval_suite,
+    #     data=dataset,
+    #     max_parallel_evals=2
+    # )
+
+    # # Validate the batch_eval_results as you want.
+
+
 if __name__ == "__main__":
     load_data()
     evaluate_and_validate()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index a974705..f8985b1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
 llama_index==0.9.40
-langchain-openai==0.0.3
 openai==1.12.0
-python-dotenv==1.0.1
+python-dotenv==1.0.1
\ No newline at end of file