Commit
WIP
yoomlam committed May 2, 2024
1 parent 68b184b commit d18f146
Showing 5 changed files with 868 additions and 12 deletions.
02-household-queries/dspy_engine.py (7 changes: 5 additions & 2 deletions)
@@ -177,7 +177,7 @@ def create_retriever_model():
 
 
 @debugging.timer
-def create_llm_model(llm_name="openhermes"):
+def create_llm_model(llm_name="openhermes", respond_with_json=False):
     print("LLM model name:", llm_name)
     if llm_name in ["openhermes", "llama2", "llama2:chat", "llama3", "mistral", "mistral:instruct"]:
         # Alternative using OpenAI-compatible API: https://gist.github.com/jrknox1977/78c17e492b5a75ee5bbaf9673aee4641
@@ -188,7 +188,10 @@ def create_llm_model(llm_name="openhermes"):
         "gpt-4",
         "gpt-4-turbo",
     ]:
-        return dspy.OpenAI(model=llm_name, temperature=0.1, response_format={"type": "json_object"})
+        if respond_with_json:
+            return dspy.OpenAI(model=llm_name, temperature=0.1, response_format={"type": "json_object"})
+        else:
+            return dspy.OpenAI(model=llm_name, temperature=0.1)
     elif llm_name in ["gemini-1.0-pro"]:
         return dspy.Google(model=f"models/{llm_name}", temperature=0.1)
     elif llm_name in ["llama3-70b-8192", "mixtral-8x7b-32768"]:
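The point of the new respond_with_json flag: OpenAI's JSON mode (response_format={"type": "json_object"}) was previously hardcoded for the GPT models, forcing every caller into JSON output. A minimal usage sketch follows; it assumes a 2024-era dspy-ai release (where dspy.OpenAI and dspy.settings.configure exist) and an OPENAI_API_KEY in the environment, and the create_llm_model import path is hypothetical.

# Sketch only: exercising the new flag; not part of this commit.
import dspy

from dspy_engine import create_llm_model  # hypothetical import path

# Default: plain-text completions; no response_format is sent to the API.
text_lm = create_llm_model("gpt-4-turbo")

# Opt in to OpenAI's JSON mode, which constrains output to a single JSON object.
json_lm = create_llm_model("gpt-4-turbo", respond_with_json=True)

dspy.settings.configure(lm=json_lm)

Making JSON mode opt-in is the safer default, since forcing response_format on every call breaks prompts that expect free-form text.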
02-household-queries/ingest.py (40 changes: 30 additions & 10 deletions)
@@ -122,20 +122,16 @@ def add_json_html_data_to_vector_db(
     chunk_overlap=300,
     silent=False,
 ):
-    data_file = open(file_path, encoding="utf-8")
-    json_data = json.load(data_file)
+    question_answers = extract_qa_text_from_guru(file_path, content_key, index_key)
+
     if embedding_name:
         check_embedding(chunk_size, get_embeddings().get(embedding_name, ""))
-    for content in json_data:
-        if not content[index_key].strip().endswith("?"):
-            continue
-        soup = BeautifulSoup(content[content_key], "html.parser")
-        text = soup.get_text(separator="\n", strip=True)
+    for question, answer in question_answers.items():
         if not silent:
-            print("Processing document:", content[index_key])
+            print("Processing document:", question)
         chunks = get_text_chunks_langchain(
-            text,
-            content[index_key],
+            answer,
+            question,
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
             token_limit=token_limit,
@@ -145,6 +141,30 @@
         vectordb.add_documents(documents=chunks)
 
 
+def extract_qa_text_from_guru(
+    file_path="./guru_cards_for_nava.json", content_key="content", index_key="preferredPhrase"
+):
+    json_data = load_guru_cards(file_path)
+    question_answers = extract_question_answers(index_key, content_key, json_data)
+    return question_answers
+
+
+def extract_question_answers(question_key, answer_key, json_data):
+    question_answers = {}
+    for content in json_data:
+        if not content[question_key].strip().endswith("?"):
+            continue
+        soup = BeautifulSoup(content[answer_key], "html.parser")
+        answer = soup.get_text(separator="\n", strip=True)
+        question_answers[content[question_key]] = answer
+    return question_answers
+
+
+def load_guru_cards(file_path):
+    with open(file_path, encoding="utf-8") as data_file:
+        return json.load(data_file)
+
+
 def ingest_call(
     vectordb,
     embedding_name=None,
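To see what the extracted helpers do, here is a small sketch that feeds extract_question_answers an invented stand-in for a Guru card export. The helper name and argument order (question key, answer key, data) match the diff; the import path and sample data are assumptions.

# Sketch only: invented sample data; not part of this commit.
from ingest import extract_question_answers  # hypothetical import path

cards = [
    {"preferredPhrase": "Who counts as a household member?",
     "content": "<p>Anyone who purchases and prepares food together.</p>"},
    {"preferredPhrase": "Household definitions",
     "content": "<p>Skipped: the title is not a question.</p>"},
]

qa = extract_question_answers("preferredPhrase", "content", cards)
# qa == {"Who counts as a household member?":
#        "Anyone who purchases and prepares food together."}

for question, answer in qa.items():  # a dict, so iterate .items() for pairs
    print("Q:", question)
    print("A:", answer)

Cards whose preferredPhrase does not end with "?" are filtered out, which is why only the first sample card survives.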
(Diffs for the remaining 3 changed files are not shown.)
