Showing 12 changed files with 397 additions and 31 deletions.
@@ -0,0 +1,134 @@
import fire
from langsmith import Client
from datetime import datetime, timedelta
from tqdm import tqdm
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset
from deepeval.metrics.ragas import RAGASAnswerRelevancyMetric, RAGASFaithfulnessMetric
# from deepeval.metrics import ContextualRelevancyMetric, FaithfulnessMetric, AnswerRelevancyMetric
import datasets
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness, context_relevancy, context_utilization
import json
import csv


class EvaluationProcessor:
    def __init__(self):
        self.client = Client()

    def _get_prompt(self, original_text, content_to_remove):
        """
        Remove every line of content_to_remove from the original text.

        Parameters:
            original_text (str): The original text containing the lines to remove.
            content_to_remove (str): The content whose lines should be removed.

        Returns:
            str: The original text with the specified lines removed.
        """
        original_lines = original_text.splitlines()
        lines_to_remove = set(content_to_remove.splitlines())

        filtered_lines = [line for line in original_lines if line not in lines_to_remove]
        return '\n'.join(filtered_lines)

    def _format_documents(self, docs):
        # Concatenate the page content of the retrieved documents, separated by blank lines.
        formatted_output = ""
        for item in docs:
            formatted_output += item["page_content"] + "\n\n"
        return formatted_output

    def _save_results(self, results, output_format):
        if output_format == 'json':
            with open('results.json', 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=4)
        elif output_format == 'csv':
            keys = results[0].keys()
            with open('results.csv', 'w', newline='', encoding='utf-8') as f:
                dict_writer = csv.DictWriter(f, fieldnames=keys)
                dict_writer.writeheader()
                dict_writer.writerows(results)
        else:
            raise ValueError("Unsupported output format. Please use 'csv' or 'json'.")

    def _process_runs(self, project_name, days=None, run_ids_file=None, output_format='csv'):
        # Select the LangSmith runs to process: either all root runs from the last
        # `days` days, or the runs whose IDs are listed (one per line) in `run_ids_file`.
        if days:
            start_time = datetime.utcnow() - timedelta(days=days)
            runs = self.client.list_runs(project_name=project_name, start_time=start_time, is_root=True)
        elif run_ids_file:
            with open(run_ids_file, 'r') as file:
                run_ids = [line.strip() for line in file.readlines()]
            runs = self.client.list_runs(project_name=project_name, run_ids=run_ids)
        else:
            raise ValueError("Either 'days' or 'run_ids_file' must be specified.")

        results = []
        for run in tqdm(runs, desc="Processing runs"):
            # Reset per-run values so a trace without a Retriever or ChatOpenAI child run
            # does not silently reuse the documents or prompt of the previous run.
            docs, format_docs, prompt = [], "", ""
            child_runs = self.client.list_runs(trace_id=run.trace_id)
            for child_run in child_runs:
                if child_run.name == "Retriever":
                    docs = [doc['page_content'] for doc in child_run.outputs['documents']]
                    format_docs = self._format_documents(child_run.outputs['documents'])
                if child_run.name == "ChatOpenAI":
                    if "SystemMessage" in child_run.inputs['messages'][0]['id']:
                        sys_message = child_run.inputs['messages'][0]['kwargs']['content']
                        # The prompt is the system message with the retrieved documents stripped out.
                        prompt = self._get_prompt(sys_message, format_docs)
            results.append({
                'run_id': run.id,
                'prompt': prompt,
                'question': run.inputs['question'],
                'answer': run.outputs['answer'] if run.outputs is not None else "",
                'formatted_docs': format_docs,
                'chat_history': run.inputs['chat_history'],
                'docs': docs
            })
        return results

    def save_runs(self, project_name, days=None, run_ids_file=None, output_format="csv"):
        results = self._process_runs(project_name, days, run_ids_file)
        self._save_results(results, output_format)

    def ragas_eval(self, project_name, days=None, run_ids_file=None, output_format='csv'):
        results = self._process_runs(project_name, days, run_ids_file)
        ragas_dataset = {"question": [], "answer": [], "ground_truths": [], "contexts": []}

        for result in results:
            ragas_dataset["question"].append(result['question'])
            ragas_dataset["answer"].append(result['answer'])
            ragas_dataset["ground_truths"].append([])
            ragas_dataset["contexts"].append(result['docs'])

        hf_ragas_dataset = datasets.Dataset.from_dict(ragas_dataset)
        evaluation_result = evaluate(
            hf_ragas_dataset,
            raise_exceptions=False,
            metrics=[
                context_relevancy,
                faithfulness,
                answer_relevancy,
                context_utilization
            ]
        )
        df = evaluation_result.to_pandas()
        if output_format == "csv":
            df.to_csv("ragas_evaluation_results.csv", index=False)
        elif output_format == "json":
            # pandas only accepts index=False for the 'split' and 'table' orients,
            # so write one JSON record per row instead.
            df.to_json("ragas_evaluation_results.json", orient="records")

    def deepeval(self, project_name, days=None, run_ids_file=None):
        results = self._process_runs(project_name, days, run_ids_file)
        dataset = EvaluationDataset()
        for result in results:
            test_case = LLMTestCase(input=result['question'],
                                    actual_output=result['answer'],
                                    retrieval_context=result['docs'],
                                    )
            dataset.add_test_case(test_case)
        faithfulness_metric = RAGASFaithfulnessMetric(threshold=0.5)
        relevancy_metric = RAGASAnswerRelevancyMetric()
        dataset.evaluate([faithfulness_metric, relevancy_metric])


if __name__ == '__main__':
    fire.Fire(EvaluationProcessor)
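The class is exposed through python-fire, so its public methods double as CLI subcommands or can be called directly from Python. A minimal usage sketch follows, assuming LangSmith and OpenAI credentials are configured in the environment; the module name and project name are placeholders for illustration, not values taken from this commit.

    # "evaluation_processor" is a hypothetical module name for the file above.
    from evaluation_processor import EvaluationProcessor

    processor = EvaluationProcessor()
    # Dump the last 7 days of root runs from an assumed LangSmith project to results.json.
    processor.save_runs(project_name="my-langsmith-project", days=7, output_format="json")
    # Score the same runs with the RAGAS metrics and write ragas_evaluation_results.csv.
    processor.ragas_eval(project_name="my-langsmith-project", days=7, output_format="csv")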
@@ -0,0 +1,23 @@
ea84af53-f4ce-4eeb-b5bd-42bfd4d9d06e
fcdc9c6f-dc4c-446d-a7fc-7552bb8e0643
b5291b99-c6e8-4e9a-a454-9047dabe7bcd
c9fa9be2-d350-4f0d-ad7e-eac5125a4740
e5dfc187-e24a-45ab-aa0e-614a5d0796b0
d2cbc6a7-fead-4d98-a864-61997aedc9ea
02be1332-c9ef-4b5f-b173-92b52780f3e1
48e78219-b27f-425b-8242-bba7091354d3
2666c586-fc22-40c4-b86b-2850f6dfd8b1
a560b2cf-5f51-49e6-a5b8-254cc4381f57
0827e602-8df4-4ddc-98c6-899ae91f2253
fd983384-e78f-4bbc-9487-af51bfe26e16
71aa2783-2026-44cb-ae6d-c9d157063105
8d8e5d06-aba0-4292-ac34-58b44f98fdc6
4580340f-e146-4fa0-abaa-5128b60a02c4
36be4151-62cd-4824-8ae3-69e929d92e7e
ac339665-2f43-40c7-bc0d-50bffcad7d61
c4a188df-2ad6-4236-97ec-8bfc18cf6b91
895e76be-f6fc-4c56-9cd4-3469bdd40dba
9528ddb3-f62c-4c83-b028-dcb73d2d7683
f3aa4d09-e8ab-4cd5-a159-f43fddbfc47d
e657af9f-a4da-4861-83b7-315c9f4be594
8d0a4e00-5e3b-4184-8166-b09b60625a9e
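These 23 UUIDs appear to be LangSmith run IDs in the one-per-line format read by the run_ids_file option of _process_runs above. A hedged sketch of feeding such a file to the evaluator, with the file path and project name assumed for illustration:

    # Assumes the EvaluationProcessor above and that this ID list is saved as run_ids.txt.
    processor = EvaluationProcessor()
    processor.deepeval(project_name="my-langsmith-project", run_ids_file="run_ids.txt")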
@@ -1,12 +1,9 @@
nbdime==3.1.1
python-lsp-server[all]==1.4.1
jupyterlab-lsp== 3.10.1
nbqa==1.3.1
jupyterlab-tabular-data-editor==1.0.0
csvs-to-sqlite==1.3
ipython>=8.10.0 # not directly required, pinned by Snyk to avoid a vulnerability
numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability
setuptools>=65.5.1 # not directly required, pinned by Snyk to avoid a vulnerability
jupyter-server>=2.11.2 # not directly required, pinned by Snyk to avoid a vulnerability
tornado>=6.3.3 # not directly required, pinned by Snyk to avoid a vulnerability
pipdeptree==2.2.1
urllib3>=2.2.2 # not directly required, pinned by Snyk to avoid a vulnerability