-
Notifications
You must be signed in to change notification settings - Fork 0
/
try_with_newPDF_v1.py
173 lines (144 loc) · 6.91 KB
/
try_with_newPDF_v1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
"""
Created by: Urveshkumar Koshti
Purpose of the code: To create the Chatbot for the Question-Answer based on the PDF.
Attributes of the Local Chatbot:
--> Input to the Code (via Frontend interface): PDF file
Output from the code (on the Frontend interface): Text Generation
--> Saves the Chat History
--> Creates the Vector Database using SQL
--> Reuses the same Vector Database for questions about the same PDF until the User uploads a new PDF
To run the File in Docker container:
--> Pull the image from the Docker Hub and then Run the Image.
--> Open the Chatbot by running: streamlit run Chatbot_app.py
"""
from langchain.chains import RetrievalQA
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain_community.llms import Ollama
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
import streamlit as st
import os
import time
# Make sure the working directories exist: 'files' receives the uploaded
# PDFs and 'vector_database' is the Chroma persistence directory.
# exist_ok=True keeps this idempotent and avoids the check-then-create race
# of os.path.exists() followed by os.mkdir().
os.makedirs('files', exist_ok=True)
os.makedirs('vector_database', exist_ok=True)
# Per-session objects. Each one is created only when its key is not already
# present in st.session_state, so an existing value is never overwritten.

# Raw prompt text with {context}, {history} and {question} placeholders.
if "template" not in st.session_state:
    st.session_state["template"] = """You are a knowledgeable chatbot, here to help with questions of the user. Your tone should be professional and informative.
Context: {context}
History: {history}
User: {question}
Chatbot:"""

# Prompt object built from the stored template text.
if "prompt" not in st.session_state:
    st.session_state["prompt"] = PromptTemplate(
        template=st.session_state["template"],
        input_variables=["history", "context", "question"],
    )

# Conversation memory; memory_key matches the {history} placeholder and
# input_key matches the {question} placeholder of the template.
if "memory" not in st.session_state:
    st.session_state["memory"] = ConversationBufferMemory(
        input_key="question",
        memory_key="history",
        return_messages=True,
    )

# Chroma store opened on the 'vector_database' directory, embedding through
# the Ollama "mistral" model.
if "vectorstore" not in st.session_state:
    st.session_state["vectorstore"] = Chroma(
        embedding_function=OllamaEmbeddings(
            base_url='http://localhost:11434',
            model="mistral",
        ),
        persist_directory='vector_database',
    )

# Ollama LLM with a StreamingStdOutCallbackHandler attached.
if "llm" not in st.session_state:
    st.session_state["llm"] = Ollama(
        base_url="http://localhost:11434",
        model="mistral",
        verbose=True,
        callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    )
def clear_vectorstore():
    """Reset all per-document state held in ``st.session_state``.

    Removes the 'vectorstore', 'retriever' and 'qa_chain' entries when they
    are present, empties 'chat_history' if it has been created, and clears
    the conversation memory stored under 'memory'.

    The memory object outlives the QA chain (it is stored separately and
    handed to every newly built chain), so without clearing it the exchanges
    about the previous PDF would be injected as history into prompts about
    the next one, even though the visible chat history was reset.
    """
    for key in ('vectorstore', 'retriever', 'qa_chain'):
        if key in st.session_state:
            del st.session_state[key]

    if 'chat_history' in st.session_state:
        st.session_state.chat_history = []

    # Keep the LLM-side history in step with the (now empty) visible history.
    if 'memory' in st.session_state:
        st.session_state.memory.clear()
# Create the chat transcript list the first time through; an existing
# transcript is left untouched.
if "chat_history" not in st.session_state:
    st.session_state["chat_history"] = []

st.title("PDF Chatbot")
# Upload a PDF file
uploaded_file = st.file_uploader("Upload your PDF", type='pdf')

if uploaded_file is not None:
    # Re-index only when the uploaded file differs (by name) from the one
    # already processed in this session.
    if 'current_pdf' not in st.session_state or st.session_state.current_pdf != uploaded_file.name:
        # Remove the previous document's embeddings from the Chroma
        # collection. Dropping the session_state entry alone would leave them
        # stored under 'vector_database' and mix them into the new index.
        if 'vectorstore' in st.session_state:
            st.session_state.vectorstore.delete_collection()
        clear_vectorstore()

        st.info("Uploading and processing your PDF...")
        start_time = time.time()

        # The file name comes from the client: keep only its final component
        # so it cannot point outside the 'files' directory.
        pdf_path = "files/" + os.path.basename(uploaded_file.name) + ".pdf"
        # getvalue() returns the whole buffer regardless of the current
        # read position of the uploaded file object.
        with open(pdf_path, "wb") as f:
            f.write(uploaded_file.getvalue())

        upload_time = time.time() - start_time
        st.success(f"PDF uploaded and saved in {upload_time:.2f} seconds.")

        st.info("Analyzing your document...")
        start_time = time.time()

        data = PyPDFLoader(pdf_path).load()

        # Initialize text splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500,
            chunk_overlap=200,
            length_function=len,
        )
        all_splits = text_splitter.split_documents(data)

        # Create and persist the vector store. persist_directory is required:
        # persist() raises ValueError on a store created without one.
        st.session_state.vectorstore = Chroma.from_documents(
            documents=all_splits,
            embedding=OllamaEmbeddings(base_url='http://localhost:11434',
                                       model="mistral"),
            persist_directory='vector_database',
        )
        st.session_state.vectorstore.persist()

        processing_time = time.time() - start_time
        st.success(f"Document analyzed and embeddings created in {processing_time:.2f} seconds.")

        # Record the file only after indexing succeeded, so a failed run is
        # retried on the next script execution instead of being skipped.
        st.session_state.current_pdf = uploaded_file.name

    # Build the retriever and QA chain whenever they are missing, i.e. on the
    # first upload or after clear_vectorstore() removed them for a new PDF.
    if 'qa_chain' not in st.session_state:
        st.session_state.retriever = st.session_state.vectorstore.as_retriever()
        st.session_state.qa_chain = RetrievalQA.from_chain_type(
            llm=st.session_state.llm,
            chain_type='stuff',
            retriever=st.session_state.retriever,
            verbose=True,
            chain_type_kwargs={
                "verbose": True,
                "prompt": st.session_state.prompt,
                "memory": st.session_state.memory,
            },
        )
# Display chat history and handle user input
for message in st.session_state.chat_history:
    with st.chat_message(message["role"]):
        st.markdown(message["message"])

if uploaded_file is None:
    st.write("Please upload a PDF file.")

if user_input := st.chat_input("You:", key="user_input"):
    if 'qa_chain' not in st.session_state:
        # No document has been indexed yet, so there is no chain to query;
        # accessing st.session_state.qa_chain here would raise.
        st.warning("Please upload a PDF file before asking a question.")
    else:
        user_message = {"role": "user", "message": user_input}
        st.session_state.chat_history.append(user_message)
        with st.chat_message("user"):
            st.markdown(user_input)

        with st.chat_message("assistant"):
            # Only the chain call is timed and shown under the spinner.
            with st.spinner("Assistant is typing..."):
                start_time = time.time()
                response = st.session_state.qa_chain(user_input)
                response_time = time.time() - start_time

            answer = response['result']
            message_placeholder = st.empty()
            typed_so_far = ""
            # Typewriter effect in Streamlit
            for word in answer.split():
                typed_so_far += word + " "
                message_placeholder.markdown(typed_so_far + "▌")
                time.sleep(0.05)  # Adjust the delay to control the typing speed
            # Finish with the untouched answer: split() collapsed newlines and
            # repeated spaces, which would flatten markdown formatting and
            # differ from what the chat history replays on the next run.
            message_placeholder.markdown(answer)

        st.success(f"Response generated in {response_time:.2f} seconds.")

        chatbot_message = {"role": "assistant", "message": answer}
        st.session_state.chat_history.append(chatbot_message)