Skip to content

Commit

Permalink
Add title to embedded documents for better retrieval (#1732)
Browse files Browse the repository at this point in the history
* Add title to embedded documents for better retrieval

* [DERCBOT-1024] No need to specify the line number and URL, or duplicate the source content

* [FIX - Intellij Kotlin Build] KAPT & Source directories

* [DERCBOT-1024] fix

* Revert "[FIX - Intellij Kotlin Build] KAPT & Source directories"

This reverts commit 773c534.

* [DERCBOT-1024] Reviews

* Update gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/index_documents.py

Co-authored-by: Diverrez morgan <[email protected]>

---------

Co-authored-by: Pierre Guirriec <[email protected]>
Co-authored-by: Diverrez morgan <[email protected]>
  • Loading branch information
3 people authored Sep 2, 2024
1 parent 9f26c58 commit f4b4036
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ChatMessageHistory
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate

from gen_ai_orchestrator.errors.exceptions.exceptions import (
Expand Down Expand Up @@ -134,8 +135,8 @@ async def execute_qa_chain(query: RagQuery, debug: bool) -> RagResponse:
lambda doc: Footnote(
identifier=f'{doc.metadata["id"]}',
title=doc.metadata['title'],
url=doc.metadata['url'],
content=doc.page_content,
url=doc.metadata['source'],
content=get_source_content(doc),
),
response['source_documents'],
)
Expand All @@ -148,6 +149,21 @@ async def execute_qa_chain(query: RagQuery, debug: bool) -> RagResponse:
else None
)

def get_source_content(doc: Document) -> str:
    """Return the document's page content without its leading title prefix.

    Documents are indexed with the concatenation model ``{title}\n\n{page_content}``
    (see index_documents.py). When sources are sent back to the caller, that
    title prefix must be stripped so only the original page content remains.

    Args:
        doc: the retrieved source document.

    Returns:
        The page content with the ``{title}\n\n`` prefix removed when present,
        otherwise the page content unchanged.
    """
    prefix = f"{doc.metadata['title']}\n\n"
    content = doc.page_content
    if not content.startswith(prefix):
        return content
    return content[len(prefix):]


def create_rag_chain(query: RagQuery) -> ConversationalRetrievalChain:
"""
Expand Down Expand Up @@ -263,7 +279,7 @@ def get_rag_documents(handler: RetrieverJsonCallbackHandler) -> List[RagDocument
return [
# Get first 100 char of content
RagDocument(
content=doc['page_content'][0:100] + '...',
content=doc['page_content'][0:len(doc['metadata']['title'])+100] + '...',
metadata=RagDocumentMetadata(**doc['metadata']),
)
for doc in on_chain_start_records[0]['inputs']['input_documents']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
from docopt import docopt
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders.dataframe import DataFrameLoader
from langchain_core.documents import Document

from gen_ai_orchestrator.models.em.azureopenai.azure_openai_em_setting import (
Expand Down Expand Up @@ -100,13 +100,10 @@ def index_documents(args):
)

logging.debug(f"Read input CSV file {args['<input_csv>']}")
csv_loader = CSVLoader(
file_path=args['<input_csv>'],
source_column='url',
metadata_columns=('title', 'url'),
csv_args={'delimiter': '|', 'quotechar': '"'},
)
docs = csv_loader.load()
df = pd.read_csv(args['<input_csv>'], delimiter='|', quotechar='"', names=['title', 'source', 'text'])
loader = DataFrameLoader(df, page_content_column='text')
docs = loader.load()

for doc in docs:
doc.metadata['index_session_id'] = session_uuid
doc.metadata['index_datetime'] = formatted_datetime
Expand All @@ -120,6 +117,8 @@ def index_documents(args):
splitted_docs = text_splitter.split_documents(docs)
# Add chunk id ('n/N') metadata to each chunk
splitted_docs = generate_ids_for_each_chunks(splitted_docs=splitted_docs)
# Add title to text (for better semantic search)
splitted_docs = add_title_to_text(splitted_docs=splitted_docs)

logging.debug(f"Get embeddings model from {args['<embeddings_cfg>']} config file")
with open(args['<embeddings_cfg>'], 'r') as file:
Expand Down Expand Up @@ -164,6 +163,23 @@ def generate_ids_for_each_chunks(
return splitted_docs


def add_title_to_text(
    splitted_docs: Iterable[Document],
) -> Iterable[Document]:
    """Prefix each chunk's page_content with its 'title' metadata.

    The concatenation model used when indexing data is ``{title}\n\n{page_content}``.
    The same prefix is stripped in chain_rag.py on the orchestrator server when
    fetching sources. Embedding the title alongside the chunk content improves
    semantic search over individual chunks.

    Args:
        splitted_docs: the chunked documents to enrich (mutated in place).

    Returns:
        A list of the same Document objects, with titles prepended where a
        'title' metadata entry exists.
    """
    # Materialize first: iterating and then returning the same object would
    # hand callers an exhausted iterator if they ever pass a generator.
    docs = list(splitted_docs)
    for doc in docs:
        if 'title' in doc.metadata:
            doc.page_content = f"{doc.metadata['title']}\n\n{doc.page_content}"
    return docs


def em_settings_from_config(setting_dict: dict) -> BaseEMSetting:
"""Get embeddings settings from config dict."""
# Create settings class according to embeddings provider from config file
Expand Down

0 comments on commit f4b4036

Please sign in to comment.