Skip to content

Commit

Permalink
Add title to embedded documents for better retrieval (#1732)
Browse files Browse the repository at this point in the history
* Add title to embedded documents for better retrieval

* [DERCBOT-1024] No need to specify the line number and URL, or duplicate the source content

* [FIX - Intellij Kotlin Build] KAPT & Source directories

* [DERCBOT-1024] fix

* Revert "[FIX - Intellij Kotlin Build] KAPT & Source directories"

This reverts commit 773c534.

* [DERCBOT-1024] Reviews

* Update gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/index_documents.py

Co-authored-by: Diverrez morgan <[email protected]>

---------

Co-authored-by: Pierre Guirriec <[email protected]>
Co-authored-by: Diverrez morgan <[email protected]>
  • Loading branch information
3 people authored Sep 2, 2024
1 parent 9f26c58 commit f4b4036
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ChatMessageHistory
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate

from gen_ai_orchestrator.errors.exceptions.exceptions import (
Expand Down Expand Up @@ -134,8 +135,8 @@ async def execute_qa_chain(query: RagQuery, debug: bool) -> RagResponse:
lambda doc: Footnote(
identifier=f'{doc.metadata["id"]}',
title=doc.metadata['title'],
url=doc.metadata['url'],
content=doc.page_content,
url=doc.metadata['source'],
content=get_source_content(doc),
),
response['source_documents'],
)
Expand All @@ -148,6 +149,21 @@ async def execute_qa_chain(query: RagQuery, debug: bool) -> RagResponse:
else None
)

def get_source_content(doc: Document) -> str:
    """Return the document's page content without its leading title prefix.

    Documents are indexed with the concatenation model ``{title}\n\n{page_content}``
    (see index_documents.py). When sources are sent back to the caller, that
    title prefix must be stripped so only the original page content remains.

    Args:
        doc: the retrieved source document.

    Returns:
        The page content with the ``{title}\n\n`` prefix removed when present,
        otherwise the page content unchanged.
    """
    prefix = f"{doc.metadata['title']}\n\n"
    content = doc.page_content
    if not content.startswith(prefix):
        return content
    return content[len(prefix):]


def create_rag_chain(query: RagQuery) -> ConversationalRetrievalChain:
"""
Expand Down Expand Up @@ -263,7 +279,7 @@ def get_rag_documents(handler: RetrieverJsonCallbackHandler) -> List[RagDocument
return [
# Get first 100 char of content
RagDocument(
content=doc['page_content'][0:100] + '...',
content=doc['page_content'][0:len(doc['metadata']['title'])+100] + '...',
metadata=RagDocumentMetadata(**doc['metadata']),
)
for doc in on_chain_start_records[0]['inputs']['input_documents']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
from docopt import docopt
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders.dataframe import DataFrameLoader
from langchain_core.documents import Document

from gen_ai_orchestrator.models.em.azureopenai.azure_openai_em_setting import (
Expand Down Expand Up @@ -100,13 +100,10 @@ def index_documents(args):
)

logging.debug(f"Read input CSV file {args['<input_csv>']}")
csv_loader = CSVLoader(
file_path=args['<input_csv>'],
source_column='url',
metadata_columns=('title', 'url'),
csv_args={'delimiter': '|', 'quotechar': '"'},
)
docs = csv_loader.load()
df = pd.read_csv(args['<input_csv>'], delimiter='|', quotechar='"', names=['title', 'source', 'text'])
loader = DataFrameLoader(df, page_content_column='text')
docs = loader.load()

for doc in docs:
doc.metadata['index_session_id'] = session_uuid
doc.metadata['index_datetime'] = formatted_datetime
Expand All @@ -120,6 +117,8 @@ def index_documents(args):
splitted_docs = text_splitter.split_documents(docs)
# Add chunk id ('n/N') metadata to each chunk
splitted_docs = generate_ids_for_each_chunks(splitted_docs=splitted_docs)
# Add title to text (for better semantic search)
splitted_docs = add_title_to_text(splitted_docs=splitted_docs)

logging.debug(f"Get embeddings model from {args['<embeddings_cfg>']} config file")
with open(args['<embeddings_cfg>'], 'r') as file:
Expand Down Expand Up @@ -164,6 +163,23 @@ def generate_ids_for_each_chunks(
return splitted_docs


def add_title_to_text(
    splitted_docs: Iterable[Document],
) -> Iterable[Document]:
    """Prefix each chunk's page_content with its 'title' metadata.

    The concatenation model used when indexing data is ``{title}\n\n{page_content}``.
    The same prefix is stripped in chain_rag.py on the orchestrator server when
    fetching sources. Embedding the title alongside the chunk content improves
    semantic search over individual chunks.

    Args:
        splitted_docs: the chunked documents to enrich (mutated in place).

    Returns:
        A list of the same Document objects, with titles prepended where a
        'title' metadata entry exists.
    """
    # Materialize first: iterating and then returning the same object would
    # hand callers an exhausted iterator if they ever pass a generator.
    docs = list(splitted_docs)
    for doc in docs:
        if 'title' in doc.metadata:
            doc.page_content = f"{doc.metadata['title']}\n\n{doc.page_content}"
    return docs


def em_settings_from_config(setting_dict: dict) -> BaseEMSetting:
"""Get embeddings settings from config dict."""
# Create settings class according to embeddings provider from config file
Expand Down

0 comments on commit f4b4036

Please sign in to comment.