docubot.py
#!/usr/bin/env python3
"""
This module provides the entry point for the DocuBot application. It includes functions for building a knowledge base, initializing the application, and running the chatbot. The main function `docubot()` creates an argparse to parse the data directory as a mandatory argument, loads API keys, creates or loads an existing index, and instantiates the QA chatbot.
"""
from typing import List, TypeVar
T = TypeVar("T")
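
# Example invocation, based on the argparse options defined in docubot() below
# (the documents directory path is illustrative):
#
#   python docubot.py ./my_docs
#   python docubot.py ./my_docs --index_name docubot-index --reload all
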
def chatbot(vector_store):
    """
    A chatbot loop that lets users ask questions and receive answers with citations.

    Args:
        vector_store: The vector store used for semantic search over the indexed documents.

    Returns:
        None
    """
    import time

    from docubot_utils.docubot_utils import answer_question_session, format_citations

    i = 1
    print("Welcome to the chatbot. Type 'quit' or 'exit' to exit")
    chat_history = []
    while True:
        question = input(f"Question #{i}: ")
        if question.lower() in ["quit", "exit"]:
            print("Exiting chatbot")
            time.sleep(2)
            break
        result, chat_history = answer_question_session(
            question, vector_store=vector_store, chat_history=chat_history
        )
        print(f"\nAnswer #{i}: {result['answer']}")
        # Join the documents in result['source_documents'] into a newline-separated string
        citations = format_citations(result["source_documents"])
        print(f"\nCitations #{i}:\n{citations}")
        print(f"\n {'-' * 50} \n")
        i += 1
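
# answer_question_session() and format_citations() live in docubot_utils and are
# not defined in this module. As a rough sketch (an assumption, not the actual
# implementation), format_citations() is expected to join each source document's
# origin into a newline-separated string, roughly:
#
#   def format_citations(source_documents):
#       return "\n".join(doc.metadata.get("source", "") for doc in source_documents)
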
def build_kb(data_directory: str) -> List[T]:
    """
    Builds a knowledge base by loading documents from the given data directory.

    Args:
        data_directory (str): The path to the directory containing the documents.

    Returns:
        List[T]: A list of document chunks, each roughly 512 tokens long.
    """
    import os

    from tqdm import tqdm

    from document_loaders.document_loaders import load_document, chunk_data

    list_of_doc_names = []
    # List of supported document types
    extensions = (".pdf", ".docx", ".md", ".txt")
    for root, _, files in os.walk(data_directory):
        for file in files:
            if file.endswith(extensions):
                list_of_doc_names.append(os.path.join(root, file))

    # This is the knowledge base, split into chunks of roughly 512 tokens each
    data = []
    print("Splitting documents into chunks")
    for doc_name in tqdm(list_of_doc_names):
        data.extend(load_document(doc_name))
    print(f"There are {len(data)} pages in the knowledge base")
    chunks = chunk_data(data, chunk_size=512, chunk_overlap=20)
    print(f"These have been split into {len(chunks)} chunks for indexing")
    return chunks
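
# load_document() and chunk_data() come from the document_loaders package and are
# not defined here. A minimal sketch of what chunk_data() is assumed to do, using
# LangChain's RecursiveCharacterTextSplitter (the real helper may differ):
#
#   from langchain.text_splitter import RecursiveCharacterTextSplitter
#
#   def chunk_data(data, chunk_size=512, chunk_overlap=20):
#       splitter = RecursiveCharacterTextSplitter(
#           chunk_size=chunk_size, chunk_overlap=chunk_overlap
#       )
#       return splitter.split_documents(data)
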
def init():
    """
    Initializes the Pinecone client with the API key and environment variables.
    """
    from dotenv import load_dotenv, find_dotenv
    import pinecone
    import os

    # Load API keys
    load_dotenv(find_dotenv(), override=True)
    pinecone.init(
        api_key=os.environ.get("PINECONE_API_KEY"),
        environment=os.environ.get("PINECONE_ENV"),
    )
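
# init() expects these variables to be available, for example via a .env file
# picked up by find_dotenv() (values below are placeholders, not real keys):
#
#   PINECONE_API_KEY=<your-pinecone-api-key>
#   PINECONE_ENV=<your-pinecone-environment>
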
def docubot():
    """
    Entry point of the DocuBot application. It uses argparse to parse the documents
    directory, which is a mandatory argument, loads API keys, creates or loads an
    existing index, and instantiates the QA chatbot.
    """
    import argparse

    import pinecone
    from pinecone_utils.pinecone_utils import (
        create_index,
        create_vector_store,
        fetch_vector_store,
        delete_pinecone_index,
    )
    from langchain.vectorstores import Pinecone

    parser = argparse.ArgumentParser(description="DocuBot")
    parser.add_argument("docs_dir", type=str, help="Path to the documents directory")
    parser.add_argument(
        "--index_name",
        type=str,
        default="docubot-index",
        help="Descriptive name for the backend vector DB index",
    )
    parser.add_argument(
        "--reload",
        type=str,
        help="Name of the index to reload. If 'all', all indices will be deleted.",
    )
    args = parser.parse_args()
    docs_directory = args.docs_dir
    index_name = args.index_name
    reload_index_name = args.reload

    print(f"Instantiating DocuBot for {docs_directory}")
    init()

    vector_store: Pinecone = None
    if reload_index_name:
        # Delete the named index (or all indices if 'all' was given; as a free-tier
        # user we can only have one index at a time), then rebuild from scratch.
        delete_pinecone_index(reload_index_name)
        create_index(index_name)
        vector_store = create_vector_store(index_name, chunks=build_kb(docs_directory))
    elif index_name in pinecone.list_indexes():
        vector_store = fetch_vector_store(index_name)
    else:
        create_index(index_name)
        vector_store = create_vector_store(index_name, chunks=build_kb(docs_directory))

    chatbot(vector_store=vector_store)

if __name__ == "__main__":
    docubot()