Skip to content

Commit

Permalink
adding a setting to be able to override the chunk size when embedding
Browse files Browse the repository at this point in the history
  • Loading branch information
shanbady committed Jan 21, 2025
1 parent 5a3e772 commit 9b7cef7
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 1 deletion.
6 changes: 6 additions & 0 deletions main/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -848,3 +848,9 @@ def get_all_config_keys():
# AI budget and limit settings.
AI_BUDGET_DURATION = get_string(name="AI_BUDGET_DURATION", default="60m")
AI_MAX_BUDGET = get_float(name="AI_MAX_BUDGET", default=0.05)
AI_ANON_LIMIT_MULTIPLIER = get_float(name="AI_ANON_LIMIT_MULTIPLIER", default=10.0)

# Content file embedding settings.
# The chunk size override defaults to None (no override configured).
# NOTE(review): the lookup name below has no "_OVERRIDE" suffix, unlike the
# setting it is assigned to -- confirm this is intentional.
CONTENT_FILE_EMBEDDING_CHUNK_SIZE_OVERRIDE = get_int(
    name="CONTENT_FILE_EMBEDDING_CHUNK_SIZE",
    default=None,
)
# Chunk overlap handed to the text splitter when embedding content files.
CONTENT_FILE_EMBEDDING_CHUNK_OVERLAP = get_int(
    name="CONTENT_FILE_EMBEDDING_CHUNK_OVERLAP",
    default=50,
)
9 changes: 8 additions & 1 deletion vector_search/utils.py
Original file line number Diff line number Diff line change
def _get_text_splitter(encoder):
    """
    Get the text splitter to use based on the encoder

    Selection order:

    1. If ``settings.CONTENT_FILE_EMBEDDING_CHUNK_SIZE_OVERRIDE`` is truthy,
       a ``RecursiveCharacterTextSplitter`` with that chunk size is returned,
       regardless of the encoder.
    2. Otherwise, if the encoder has a truthy ``token_encoding_name``, a
       ``TokenTextSplitter`` using that encoding is returned.
    3. Otherwise a ``RecursiveCharacterTextSplitter`` with a chunk size of
       512 is returned.

    Both ``RecursiveCharacterTextSplitter`` variants take their overlap from
    ``settings.CONTENT_FILE_EMBEDDING_CHUNK_OVERLAP``.

    Args:
        encoder: encoder object; only its optional ``token_encoding_name``
            attribute is inspected here.

    Returns:
        The text splitter instance to use for chunking.
    """
    chunk_size_override = settings.CONTENT_FILE_EMBEDDING_CHUNK_SIZE_OVERRIDE
    if not chunk_size_override:
        # Only consult the encoder when no explicit chunk size is configured.
        token_encoding_name = getattr(encoder, "token_encoding_name", None)
        if token_encoding_name:
            # leverage tiktoken to ensure we stay within token limits
            return TokenTextSplitter(encoding_name=token_encoding_name)
    # Explicit override, or the 512 default for use with fastembed.
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size_override or 512,
        chunk_overlap=settings.CONTENT_FILE_EMBEDDING_CHUNK_OVERLAP,
        add_start_index=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )
Expand Down

0 comments on commit 9b7cef7

Please sign in to comment.