-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathevaluate.py
230 lines (200 loc) · 8.49 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# eval RAG
# eval Retrieval
# eval Response
import json
import numpy as np
from langchain.evaluation import QAEvalChain
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from utils import json_loader
from langchain.callbacks.manager import CallbackManager, trace_as_chain_group
# Few-shot prompt that asks the model to break an answer into standalone
# statements and reply with a JSON object holding a "statements" list.
# The text contains three worked examples (including one whose expected output
# is an empty list) followed by the slots to fill: {question} and {answer}.
# Doubled braces ({{ and }}) are escaped literal braces for the template
# formatter, so the JSON examples are not mistaken for placeholders.
LONG_FORM_ANSWER_PROMPT = HumanMessagePromptTemplate.from_template(
"""\
Create one or more statements from each sentence in the given answer.
question: Who was Albert Einstein and what is he best known for?
answer: He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.
statements in json:
{{
"statements": [
"Albert Einstein was born in Germany.",
"Albert Einstein was best known for his theory of relativity."
]
}}
question: Cadmium Chloride is slightly soluble in this chemical, it is also called what?
answer: alcohol
statements in json:
{{
"statements": [
"Cadmium Chloride is slightly soluble in alcohol."
]
}}
question: Were Hitler and Benito Mussolini of the same nationality?
answer: Sorry, I can't provide answer to that question.
statements in json:
{{
"statements": []
}}
question:{question}
answer: {answer}
statements in json:""" # noqa: E501
)
# Few-shot prompt that asks the model whether a single retrieved context is
# useful for answering a question. The two worked examples show the expected
# reply: a one-line JSON object with a free-text "reason" and a "verdict" of
# "Yes" or "No". Slots to fill: {question} and {context}.
# Doubled braces ({{ and }}) are escaped literal braces for the template
# formatter, so the JSON examples are not mistaken for placeholders.
CONTEXT_PRECISION = HumanMessagePromptTemplate.from_template(
"""\
Verify if the information in the given context is useful in answering the question.
question: What are the health benefits of green tea?
context:
This article explores the rich history of tea cultivation in China, tracing its roots back to the ancient dynasties. It discusses how different regions have developed their unique tea varieties and brewing techniques. The article also delves into the cultural significance of tea in Chinese society and how it has become a symbol of hospitality and relaxation.
verification:
{{"reason":"The context, while informative about the history and cultural significance of tea in China, does not provide specific information about the health benefits of green tea. Thus, it is not useful for answering the question about health benefits.", "verdict":"No"}}
question: How does photosynthesis work in plants?
context:
Photosynthesis in plants is a complex process involving multiple steps. This paper details how chlorophyll within the chloroplasts absorbs sunlight, which then drives the chemical reaction converting carbon dioxide and water into glucose and oxygen. It explains the role of light and dark reactions and how ATP and NADPH are produced during these processes.
verification:
{{"reason":"This context is extremely relevant and useful for answering the question. It directly addresses the mechanisms of photosynthesis, explaining the key components and processes involved.", "verdict":"Yes"}}
question:{question}
context:
{context}
verification:""" # noqa: E501
)
# Module-level list, created empty at import time.
# NOTE(review): the Evaluate class never reads this name -- confirm whether it
# is still needed.
ground_truth = []
class Evaluate:
    """LLM-judged metrics over a question-answering dataset.

    Args:
        llm: Callable that receives a list of chat prompts and returns one
            entry per prompt, each entry being an iterable of generations
            exposing a ``.text`` attribute.
            NOTE(review): this call convention is inferred from how the
            methods below use ``self.llm`` -- confirm against the actual
            model wrapper.
        dataset: Mapping-like object indexed by column name. The methods read
            the ``"question"``, ``"answer"`` and ``"contexts"`` columns, where
            ``"contexts"`` holds one list of context strings per question.
    """

    def __init__(self, llm, dataset):
        self.dataset = dataset
        self.llm = llm

    def cores_batch(self):
        """Run the statement-extraction prompt for every question/answer pair.

        Work in progress: the statements returned by the model are not yet
        checked against the contexts, so the method currently returns ``None``.
        """
        # Plain column reads: unpacking from a set literal would raise on
        # unhashable lists and has no defined order anyway.
        questions = self.dataset["question"]
        answers = self.dataset["answer"]
        contexts = self.dataset["contexts"]

        # ``format`` yields a chat message, so the prompt has to be built with
        # ``from_messages`` (``from_template`` expects a template string).
        prompts = [
            ChatPromptTemplate.from_messages(
                [LONG_FORM_ANSWER_PROMPT.format(question=q, answer=a)]
            )
            for q, a in zip(questions, answers)
        ]
        result = self.llm(prompts)

        # TODO: verify each extracted statement against its contexts and
        # turn the verdicts into a score. Placeholder loop kept as a scaffold.
        for _context, _output in zip(contexts, result):
            pass

    def context_precision(self):
        """Score how well useful contexts are ranked for each question.

        Every (question, context) pair is judged by the LLM with the
        ``CONTEXT_PRECISION`` prompt. The per-question score is the mean of
        precision@k over the positions k whose verdict is "yes".

        Returns:
            list[float]: One score per entry of ``dataset["contexts"]``.
            A question without contexts, or without any "yes" verdict,
            scores 0.0. A reply lacking a verdict yields ``nan`` for that
            question.
        """
        questions = self.dataset["question"]
        contexts = self.dataset["contexts"]

        # One prompt per retrieved context, flattened across all questions.
        prompts = [
            ChatPromptTemplate.from_messages(
                [CONTEXT_PRECISION.format(question=question, context=chunk)]
            )
            for question, context in zip(questions, contexts)
            for chunk in context
        ]
        results = self.llm(prompts)
        responses = [[generation.text for generation in result] for result in results]

        # Cumulative offsets let us cut the flat response list back into
        # one slice per question.
        offsets = np.cumsum([0] + [len(context) for context in contexts])
        grouped_responses = [
            responses[start:end] for start, end in zip(offsets[:-1], offsets[1:])
        ]

        scores = []
        for group in grouped_responses:
            parsed = [
                json_loader.safe_load(text, self.llm)
                for generations in group
                for text in generations
            ]
            verdicts = [
                int("yes" in reply.get("verdict", " ").lower())
                if reply.get("verdict")
                else np.nan
                for reply in parsed
            ]
            scores.append(self._average_precision(verdicts))
        return scores

    @staticmethod
    def _average_precision(verdicts):
        """Return the mean precision@k over positions with a positive verdict."""
        # The epsilon keeps the division defined when nothing is relevant.
        denominator = sum(verdicts) + 1e-10
        numerator = 0
        hits_so_far = 0
        for rank, verdict in enumerate(verdicts, start=1):
            hits_so_far += verdict
            numerator += (hits_so_far / rank) * verdict
        return numerator / denominator
if __name__ == '__main__':
    # Script entry point: read question / ground-truth lines from a text file,
    # answer each question with a retrieval-augmented chain, then score the
    # collected records with ragas.
    import sys

    from langchain.schema.output_parser import StrOutputParser
    from langchain.schema.runnable import RunnablePassthrough
    from ragas import evaluate
    from ragas.metrics import (
        answer_relevancy,
        context_precision,
        context_recall,
        faithfulness,
    )

    # NOTE(review): `vectorstore`, `llm` and `Dataset` are used below but are
    # neither defined nor imported in the visible part of this file; they must
    # be provided (e.g. `Dataset` from the `datasets` package) before this
    # script can run.

    # The input file alternates lines: question, ground truth, question, ...
    # An optional first command-line argument overrides the default location.
    qa_path = sys.argv[1] if len(sys.argv) > 1 else '/home/rb025/Documents/PVP.txt'
    # Only the first `max_sections` lines are consumed, i.e. a single
    # question / ground-truth pair. Raise this cap to evaluate more pairs.
    max_sections = 2

    questions = []
    ground_truths = []
    with open(qa_path, encoding='utf-8') as file:
        sections = file.read().split('\n')
    for i, section in enumerate(sections[:max_sections]):
        if i % 2 != 0:
            ground_truths.append([section.strip()])
        elif i < len(sections) - 1:
            # A question is only kept when a line follows it.
            questions.append(section.strip())

    template = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use two sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:"""
    prompt = ChatPromptTemplate.from_template(template)
    retriever = vectorstore.as_retriever()
    rag_chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    # Collect, per question, the retrieved passages and the generated answer.
    contexts = []
    answers = []
    for query, _ in zip(questions, ground_truths):
        contexts.append(
            [doc.page_content for doc in retriever.get_relevant_documents(query)]
        )
        answers.append(rag_chain.invoke(query))

    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths,
    }
    dataset = Dataset.from_dict(data)

    result = evaluate(
        dataset=dataset,
        metrics=[
            context_precision,
            context_recall,
            faithfulness,
            answer_relevancy,
        ],
    )
    # Tabular view of the scores; export it (e.g. to Excel) from here if needed.
    df = result.to_pandas()