Merge pull request #37 from TheDataGuild/feature/useful-welcome-message
Feature/useful welcome message
Quantisan authored Sep 27, 2023
2 parents 83a302b + db74fe3 commit 24d8f75
Showing 3 changed files with 42 additions and 20 deletions.
22 changes: 12 additions & 10 deletions mind_palace/app.py
@@ -2,39 +2,41 @@
 import index
 import openai
 import streamlit as st
+import welcome
 from llama_index.query_engine import CitationQueryEngine

 openai.api_key = st.secrets.openai_key
 xml_dir = "./resources/xmls/12-pdfs-from-steve-aug-22/"
 gpt_model = "gpt-3.5-turbo"

 st.set_page_config(page_title="Chatting with Steve's PDFs")
-st.title("Chat with Steve's 12 PDFs 💬🦙")
+st.title("Chat with Steve's PDFs 💬")

 with st.sidebar:
     st.markdown("Conversation History")
     st.text("Coming soon...")


-if "messages" not in st.session_state.keys():  # Initialize the chat messages history
-    st.session_state.messages = [
-        {"role": "assistant", "content": "Ask me a question about these PDFs"}
-    ]
-
-
 @st.cache_resource(show_spinner=False)
-def load_index(model):
+def load_nodes_and_index(xml_dir, model):
     with st.spinner(
         text="Loading and indexing the PDFs – hang tight! This should take 1-2 minutes."
     ):
         nodes = extract.seed_nodes(xml_dir)
         vector_index = index.index_nodes(nodes, model)
-        return vector_index
+        return nodes, vector_index


-vector_index = load_index(gpt_model)
+nodes, vector_index = load_nodes_and_index(xml_dir, gpt_model)
 query_engine = CitationQueryEngine.from_args(index=vector_index, verbose=True)


+if "messages" not in st.session_state.keys():  # Initialize the chat messages history
+    st.session_state.messages = [
+        {"role": "assistant", "content": welcome.get_welcome_message(nodes)}
+    ]
+
+
 if prompt := st.chat_input(
     "Your question"
 ):  # Prompt for user input and save to chat history
38 changes: 28 additions & 10 deletions mind_palace/docs.py
@@ -1,7 +1,27 @@
+from enum import Enum, auto
+
 import grobid_tei_xml
 from llama_index.schema import NodeRelationship, RelatedNodeInfo, TextNode


+class Section(Enum):
+    TITLE = auto()
+    ABSTRACT = auto()
+    BODY = auto()
+
+    def __str__(self) -> str:
+        return self.name.lower()
+
+
+def create_text_node(node_id, text, section: Section, paragraph_number=None):
+    return TextNode(
+        text=text,
+        metadata={"section": str(section), "paragraph_number": paragraph_number},
+        excluded_embed_metadata_keys=["section"],
+        id_=node_id,
+    )
+
+
 def load_tei_xml(file_path):
     print(f"Loading {file_path}")
     with open(file_path, "r") as xml_file:
@@ -25,16 +45,14 @@ def cite(xml):


 def title(xml, doc_id):
-    return TextNode(
-        text=xml.header.title,
-        id_=f"{doc_id}-title",
+    return create_text_node(
+        node_id=f"{doc_id}-title", text=xml.header.title, section=Section.TITLE
     )


 def abstract(xml, doc_id):
-    return TextNode(
-        text=xml.abstract,
-        id_=f"{doc_id}-abstract",
+    return create_text_node(
+        node_id=f"{doc_id}-abstract", text=xml.abstract, section=Section.ABSTRACT
     )


@@ -56,12 +74,12 @@ def set_next_relationships(nodes):

 def body(xml, doc_id):
     """A naive implementation of body extraction"""
     # TODO: Improve body extraction
     return [
-        TextNode(
+        create_text_node(
+            node_id=f"{doc_id}-body-paragraph-{index}",
             text=line,
-            metadata={"paragraph_number": index + 1},
-            id_=f"{doc_id}-body-paragraph-{index}",
+            section=Section.BODY,
+            paragraph_number=index + 1,
         )
         for index, line in enumerate(xml.body.split("\n"))
     ]
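
For reference, a small usage sketch of the new helper (hypothetical identifiers, not part of this diff; assumes the repository root is on the Python path so mind_palace is importable):

from mind_palace.docs import Section, create_text_node

# str(Section.TITLE) renders as "title" via the enum's __str__, so nodes are
# tagged with lowercase section names; paragraph_number defaults to None.
node = create_text_node(node_id="doc-1-title", text="An Example Paper Title", section=Section.TITLE)
print(node.metadata)  # {'section': 'title', 'paragraph_number': None}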
2 changes: 2 additions & 0 deletions mind_palace/welcome.py
@@ -0,0 +1,2 @@
+def get_welcome_message(nodes):
+    return "Ask me a question about these PDFs"
