From 1ab26d7323ef35e73487972949d00b4818aa77d5 Mon Sep 17 00:00:00 2001 From: Simonas <20096648+simjak@users.noreply.github.com> Date: Wed, 17 Apr 2024 09:16:52 +0300 Subject: [PATCH 1/2] fix: Hard split for max token size --- semantic_router/splitters/rolling_window.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/semantic_router/splitters/rolling_window.py b/semantic_router/splitters/rolling_window.py index 2f80ff3b..fcd520e9 100644 --- a/semantic_router/splitters/rolling_window.py +++ b/semantic_router/splitters/rolling_window.py @@ -215,7 +215,7 @@ def _split_documents( logger.debug(f"Document token count: {doc_token_count} tokens") # Check if current index is a split point based on similarity if doc_idx + 1 in split_indices: - if current_tokens_count + doc_token_count >= self.min_split_tokens: + if self.min_split_tokens <= current_tokens_count + doc_token_count < self.max_split_tokens: # Include the current document before splitting # if it doesn't exceed the max limit current_split.append(doc) From 86bea9891dc11c48b1fccf8db503af454a2b3a20 Mon Sep 17 00:00:00 2001 From: Simonas <20096648+simjak@users.noreply.github.com> Date: Wed, 17 Apr 2024 09:21:20 +0300 Subject: [PATCH 2/2] chore: Formatting --- semantic_router/splitters/rolling_window.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/semantic_router/splitters/rolling_window.py b/semantic_router/splitters/rolling_window.py index fcd520e9..b3ca6279 100644 --- a/semantic_router/splitters/rolling_window.py +++ b/semantic_router/splitters/rolling_window.py @@ -215,7 +215,11 @@ def _split_documents( logger.debug(f"Document token count: {doc_token_count} tokens") # Check if current index is a split point based on similarity if doc_idx + 1 in split_indices: - if self.min_split_tokens <= current_tokens_count + doc_token_count < self.max_split_tokens: + if ( + self.min_split_tokens + <= current_tokens_count + doc_token_count + < self.max_split_tokens + ): # Include the current document before splitting # if it doesn't exceed the max limit current_split.append(doc)