feat: Add HyDE and reranking to evaluation #18

Merged (2 commits, Apr 19, 2024)
56 changes: 48 additions & 8 deletions 02-household-queries/eval.py
@@ -2,13 +2,15 @@
import csv
import itertools
from datetime import datetime
import os

from langchain_community.llms.ollama import Ollama
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings

from tqdm import tqdm
from openai import OpenAI
import cohere

import chromadb
from chromadb.config import Settings
@@ -30,6 +32,13 @@
Please answer using the following context:
{context}"""

HYDE_PROMPT = """Please write a hypothetical document that would answer the following question about SNAP (food stamps.)
Collaborator
Suggested change:
-HYDE_PROMPT = """Please write a hypothetical document that would answer the following question about SNAP (food stamps.)
+HYDE_PROMPT = """Please write a hypothetical document that would answer the following question about SNAP (food stamps).

The document should start by repeating the question in more generic format and then provide the answer.
The resulting Q&A should be in the style of a document that a caseworker would use to answer an applicant's question.
In total, the document should be about 200 words long.
Do not include disclaimers about "consulting with a SNAP eligibility worker", etc.
Question: {question_text}"""

# From Phoenix Evals ("HUMAN_VS_AI_PROMPT_TEMPLATE")
EVAL_PROMPT = """You are comparing a human ground truth answer from an expert to an answer from an AI model.
Your goal is to determine if the AI answer correctly matches, in substance, the human answer.
@@ -87,12 +96,14 @@ def gpt_4_turbo(prompt):

parameters = {
# (size, overlap)
"chunk_size": [(128, 0)], # [(128, 0), (256, 0), (512, 256)],
"k": [5], # [0, 5, 10],
"model": [mistral_7b], # [gpt_4_turbo, mistral_7b],
"chunk_size": [(256, 0)],
"k": [5],
"reranking": [False],
"hyde": [False],
"model": [mistral_7b], # [gpt_4_turbo],
}

-eval_llm_client = gpt_4_turbo
+eval_llm_client = mistral_7b # gpt_4_turbo

with open("question_answer_citations.json", "r") as file:
questions = json.load(file)
@@ -141,14 +152,43 @@ def get_answer(question, parameters):
)
vector_db_chunk_size = parameters["chunk_size"]

-docs = vector_db.similarity_search(question, k=parameters["k"])
-context = "\n".join(set(doc.metadata["entire_card"] for doc in docs))
+context_search = (
+    hyde(parameters["model"], question) if parameters["hyde"] else question
+)
+
+docs = vector_db.similarity_search(context_search, k=parameters["k"])
+unique_cards = set(doc.metadata["entire_card"] for doc in docs)
+reranked_cards = (
+    rerank(question, unique_cards) if parameters["reranking"] else unique_cards
+)
+context = "\n".join(reranked_cards)

return parameters["model"](
PROMPT_WITH_CONTEXT.format(question_text=question, context=context)
)


cohere_client = None


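# Re-rank the candidate cards with Cohere's rerank endpoint, keeping only the 3 most
# relevant to the question (so with reranking enabled, at most 3 cards reach the prompt
# context even when k is larger).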
def rerank(question, docs):
global cohere_client
if not cohere_client:
cohere_client = cohere.Client(os.getenv("COHERE_API_KEY"))
results = cohere_client.rerank(
query=question,
documents=docs,
top_n=3,
model="rerank-english-v2.0",
return_documents=True,
)
return [r.document.text for r in results.results]


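# HyDE (Hypothetical Document Embeddings): ask the LLM for a hypothetical answer document
# and use that text, rather than the raw question, as the similarity-search query.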
def hyde(model, question):
return model(HYDE_PROMPT.format(question_text=question))


################################################################
# Iterating through each question for a given set of parameters
################################################################
@@ -161,12 +201,12 @@ def evaluate(eval_llm_client, questions, parameters):
"question_id": question["id"],
"question_text": question["question"],
"correct_answer": question[
"orig_answer" if not "short_answer" in question else "answer"
"orig_answer" if "short_answer" not in question else "answer"
],
"ai_generated_answer": get_answer(question["question"], parameters),
}
result["evaluation"] = eval_llm_client(EVAL_PROMPT.format(**result))
result["correct"] = not "incorrect" in result["evaluation"].lower()
result["correct"] = "incorrect" not in result["evaluation"].lower()
result |= parameters
result["model"] = result["model"].__name__
results.append(result)
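
For reference, a minimal sketch of exercising the new options (assuming the evaluation loop sweeps every combination of the listed values, which the lists in the parameters grid suggest): list both settings for the new flags in eval.py, and set COHERE_API_KEY in the environment, since rerank() reads it with os.getenv.

parameters = {
    # (size, overlap)
    "chunk_size": [(256, 0)],
    "k": [5],
    "reranking": [False, True],  # evaluate with and without Cohere reranking
    "hyde": [False, True],       # evaluate with and without HyDE query rewriting
    "model": [mistral_7b],       # gpt_4_turbo could be listed here as well
}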
1 change: 1 addition & 0 deletions 02-household-queries/requirements.in
@@ -5,6 +5,7 @@
beautifulsoup4
chainlit
chromadb
cohere
Collaborator
Reminder to run pip-compile requirements.in to update requirements.txt

Collaborator Author
Read this comment, thought "ah right", and then pushed merge 😂 -- I'll update on the base branch now before merging.
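
(For reference: pip-compile comes from pip-tools; running pip-compile requirements.in regenerates requirements.txt with pinned versions of everything in requirements.in, including the newly added cohere dependency.)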

dspy-ai
jinja2
jq