Add a setting to override the chunk size when embedding content files

mitodl · Jan 21, 2025 · 9b7cef7 · 9b7cef7
1 parent 5a3e772
commit 9b7cef7
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 1 deletion.
diff --git a/main/settings.py b/main/settings.py
@@ -848,3 +848,9 @@ def get_all_config_keys():
 AI_BUDGET_DURATION = get_string(name="AI_BUDGET_DURATION", default="60m")
 AI_MAX_BUDGET = get_float(name="AI_MAX_BUDGET", default=0.05)
 AI_ANON_LIMIT_MULTIPLIER = get_float(name="AI_ANON_LIMIT_MULTIPLIER", default=10.0)
+CONTENT_FILE_EMBEDDING_CHUNK_SIZE_OVERRIDE = get_int(
+    name="CONTENT_FILE_EMBEDDING_CHUNK_SIZE", default=None
+)
+CONTENT_FILE_EMBEDDING_CHUNK_OVERLAP = get_int(
+    name="CONTENT_FILE_EMBEDDING_CHUNK_OVERLAP", default=50
+)
diff --git a/vector_search/utils.py b/vector_search/utils.py
@@ -178,14 +178,21 @@ def _get_text_splitter(encoder):
     """
     Get the text splitter to use based on the encoder
     """
+    if settings.CONTENT_FILE_EMBEDDING_CHUNK_SIZE_OVERRIDE:
+        return RecursiveCharacterTextSplitter(
+            chunk_size=settings.CONTENT_FILE_EMBEDDING_CHUNK_SIZE_OVERRIDE,
+            chunk_overlap=settings.CONTENT_FILE_EMBEDDING_CHUNK_OVERLAP,
+            add_start_index=True,
+            separators=["\n\n", "\n", ".", " ", ""],
+        )
     if hasattr(encoder, "token_encoding_name") and encoder.token_encoding_name:
         # leverage tiktoken to ensure we stay within token limits
         return TokenTextSplitter(encoding_name=encoder.token_encoding_name)
     else:
         # default for use with fastembed
         return RecursiveCharacterTextSplitter(
             chunk_size=512,
-            chunk_overlap=50,
+            chunk_overlap=settings.CONTENT_FILE_EMBEDDING_CHUNK_OVERLAP,
             add_start_index=True,
             separators=["\n\n", "\n", ".", " ", ""],
         )