Commit
WIP
yoomlam committed May 2, 2024
1 parent 68b184b commit d18f146
Showing 5 changed files with 868 additions and 12 deletions.
02-household-queries/dspy_engine.py (7 changes: 5 additions & 2 deletions)
@@ -177,7 +177,7 @@ def create_retriever_model():
 
 
 @debugging.timer
-def create_llm_model(llm_name="openhermes"):
+def create_llm_model(llm_name="openhermes", respond_with_json=False):
     print("LLM model name:", llm_name)
     if llm_name in ["openhermes", "llama2", "llama2:chat", "llama3", "mistral", "mistral:instruct"]:
         # Alternative using OpenAI-compatible API: https://gist.github.com/jrknox1977/78c17e492b5a75ee5bbaf9673aee4641
@@ -188,7 +188,10 @@ def create_llm_model(llm_name="openhermes"):
         "gpt-4",
         "gpt-4-turbo",
     ]:
-        return dspy.OpenAI(model=llm_name, temperature=0.1, response_format={"type": "json_object"})
+        if respond_with_json:
+            return dspy.OpenAI(model=llm_name, temperature=0.1, response_format={"type": "json_object"})
+        else:
+            return dspy.OpenAI(model=llm_name, temperature=0.1)
     elif llm_name in ["gemini-1.0-pro"]:
         return dspy.Google(model=f"models/{llm_name}", temperature=0.1)
     elif llm_name in ["llama3-70b-8192", "mixtral-8x7b-32768"]:
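The point of the new respond_with_json flag: OpenAI's JSON mode (response_format={"type": "json_object"}) was previously hardcoded for the GPT models, forcing every caller into JSON output. A minimal usage sketch follows; it assumes a 2024-era dspy-ai release (where dspy.OpenAI and dspy.settings.configure exist) and an OPENAI_API_KEY in the environment, and the create_llm_model import path is hypothetical.

# Sketch only: exercising the new flag; not part of this commit.
import dspy

from dspy_engine import create_llm_model  # hypothetical import path

# Default: plain-text completions; no response_format is sent to the API.
text_lm = create_llm_model("gpt-4-turbo")

# Opt in to OpenAI's JSON mode, which constrains output to a single JSON object.
json_lm = create_llm_model("gpt-4-turbo", respond_with_json=True)

dspy.settings.configure(lm=json_lm)

Making JSON mode opt-in is the safer default, since forcing response_format on every call breaks prompts that expect free-form text.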
02-household-queries/ingest.py (40 changes: 30 additions & 10 deletions)
@@ -122,20 +122,16 @@ def add_json_html_data_to_vector_db(
     chunk_overlap=300,
     silent=False,
 ):
-    data_file = open(file_path, encoding="utf-8")
-    json_data = json.load(data_file)
+    question_answers = extract_qa_text_from_guru(file_path, content_key, index_key)
+
     if embedding_name:
         check_embedding(chunk_size, get_embeddings().get(embedding_name, ""))
-    for content in json_data:
-        if not content[index_key].strip().endswith("?"):
-            continue
-        soup = BeautifulSoup(content[content_key], "html.parser")
-        text = soup.get_text(separator="\n", strip=True)
+    for question, answer in question_answers.items():
         if not silent:
-            print("Processing document:", content[index_key])
+            print("Processing document:", question)
         chunks = get_text_chunks_langchain(
-            text,
-            content[index_key],
+            answer,
+            question,
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
             token_limit=token_limit,
@@ -145,6 +141,30 @@
         vectordb.add_documents(documents=chunks)
 
 
+def extract_qa_text_from_guru(
+    file_path="./guru_cards_for_nava.json", content_key="content", index_key="preferredPhrase"
+):
+    json_data = load_guru_cards(file_path)
+    question_answers = extract_question_answers(index_key, content_key, json_data)
+    return question_answers
+
+
+def extract_question_answers(question_key, answer_key, json_data):
+    question_answers = {}
+    for content in json_data:
+        if not content[question_key].strip().endswith("?"):
+            continue
+        soup = BeautifulSoup(content[answer_key], "html.parser")
+        answer = soup.get_text(separator="\n", strip=True)
+        question_answers[content[question_key]] = answer
+    return question_answers
+
+
+def load_guru_cards(file_path):
+    with open(file_path, encoding="utf-8") as data_file:
+        return json.load(data_file)
+
+
 def ingest_call(
     vectordb,
     embedding_name=None,
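To see what the extracted helpers do, here is a small sketch that feeds extract_question_answers an invented stand-in for a Guru card export. The helper name and argument order (question key, answer key, data) match the diff; the import path and sample data are assumptions.

# Sketch only: invented sample data; not part of this commit.
from ingest import extract_question_answers  # hypothetical import path

cards = [
    {"preferredPhrase": "Who counts as a household member?",
     "content": "<p>Anyone who purchases and prepares food together.</p>"},
    {"preferredPhrase": "Household definitions",
     "content": "<p>Skipped: the title is not a question.</p>"},
]

qa = extract_question_answers("preferredPhrase", "content", cards)
# qa == {"Who counts as a household member?":
#        "Anyone who purchases and prepares food together."}

for question, answer in qa.items():  # a dict, so iterate .items() for pairs
    print("Q:", question)
    print("A:", answer)

Cards whose preferredPhrase does not end with "?" are filtered out, which is why only the first sample card survives.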
(Diffs for the remaining 3 changed files are not shown.)
