add unit tests for both algorithms with max chunk limit

Signed-off-by: yuye-aws <[email protected]>
yuye-aws · Mar 15, 2024 · ad7ba25 · ad7ba25
1 parent d245a04
commit ad7ba25
Show file tree

Hide file tree

Showing 2 changed files with 132 additions and 0 deletions.
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java
@@ -11,6 +11,8 @@
 import org.junit.Assert;
 import org.opensearch.test.OpenSearchTestCase;
 
+import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
+import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD;
 import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD;
 
 public class DelimiterChunkerTests extends OpenSearchTestCase {
@@ -73,4 +75,48 @@ public void testChunk_withDoubleNewlineDelimiter_thenSucceed() {
         List<String> chunkResult = chunker.chunk(content, Map.of());
         assertEquals(List.of("\n\n", "a\n\n", "\n"), chunkResult);
     }
+
+    public void testChunk_whenExceedMaxChunkLimit_thenFail() {
+        int maxChunkLimit = 2;
+        DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n", MAX_CHUNK_LIMIT_FIELD, maxChunkLimit));
+        String content = "\n\na\n\n\n";
+        IllegalStateException illegalStateException = assertThrows(IllegalStateException.class, () -> chunker.chunk(content, Map.of()));
+        assert (illegalStateException.getMessage()
+            .contains(
+                String.format(
+                    Locale.ROOT,
+                    "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].",
+                    TYPE,
+                    maxChunkLimit
+                )
+            ));
+    }
+
+    public void testChunk_whenWithinMaxChunkLimit_thenSucceed() {
+        int maxChunkLimit = 3;
+        DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n", MAX_CHUNK_LIMIT_FIELD, maxChunkLimit));
+        String content = "\n\na\n\n\n";
+        List<String> chunkResult = chunker.chunk(content, Map.of());
+        assertEquals(List.of("\n\n", "a\n\n", "\n"), chunkResult);
+    }
+
+    public void testChunk_whenExceedRuntimeMaxChunkLimit_thenFail() {
+        int maxChunkLimit = 3;
+        DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n", MAX_CHUNK_LIMIT_FIELD, maxChunkLimit));
+        String content = "\n\na\n\n\n";
+        int runtimeMaxChunkLimit = 2;
+        IllegalStateException illegalStateException = assertThrows(
+            IllegalStateException.class,
+            () -> chunker.chunk(content, Map.of(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit))
+        );
+        assert (illegalStateException.getMessage()
+            .contains(
+                String.format(
+                    Locale.ROOT,
+                    "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].",
+                    TYPE,
+                    maxChunkLimit
+                )
+            ));
+    }
 }
diff --git a/...test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/...test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java
@@ -23,6 +23,8 @@
 import java.util.Map;
 import static java.util.Collections.singletonList;
 import static java.util.Collections.singletonMap;
+import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
+import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD;
 import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ALGORITHM_NAME;
 import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD;
 import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT_FIELD;
@@ -170,6 +172,18 @@ public void testChunk_whenTokenizationException_thenFail() {
             .contains(String.format(Locale.ROOT, "%s algorithm encounters exception in tokenization", ALGORITHM_NAME)));
     }
 
+    public void testChunk_withEmptyInput_thenSucceed() {
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put(TOKEN_LIMIT_FIELD, 10);
+        parameters.put(TOKENIZER_FIELD, "standard");
+        FixedTokenLengthChunker fixedTokenLengthChunker = createFixedTokenLengthChunker(parameters);
+        Map<String, Object> runtimeParameters = new HashMap<>();
+        runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000);
+        String content = "";
+        List<String> passages = fixedTokenLengthChunker.chunk(content, runtimeParameters);
+        assert (passages.isEmpty());
+    }
+
     public void testChunk_withTokenLimit10_thenSucceed() {
         Map<String, Object> parameters = new HashMap<>();
         parameters.put(TOKEN_LIMIT_FIELD, 10);
@@ -220,4 +234,76 @@ public void testChunk_withOverlapRateHalf_thenSucceed() {
         expectedPassages.add("sentences and 24 tokens by standard tokenizer in OpenSearch.");
         assertEquals(expectedPassages, passages);
     }
+
+    public void testChunk_whenExceedMaxChunkLimit_thenFail() {
+        int maxChunkLimit = 2;
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put(TOKEN_LIMIT_FIELD, 10);
+        parameters.put(TOKENIZER_FIELD, "standard");
+        parameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkLimit);
+        FixedTokenLengthChunker fixedTokenLengthChunker = createFixedTokenLengthChunker(parameters);
+        Map<String, Object> runtimeParameters = new HashMap<>();
+        runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000);
+        String content =
+            "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
+        IllegalStateException illegalStateException = assertThrows(
+            IllegalStateException.class,
+            () -> fixedTokenLengthChunker.chunk(content, runtimeParameters)
+        );
+        assert (illegalStateException.getMessage()
+            .contains(
+                String.format(
+                    Locale.ROOT,
+                    "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].",
+                    TYPE,
+                    maxChunkLimit
+                )
+            ));
+    }
+
+    public void testChunk_whenWithinMaxChunkLimit_thenSucceed() {
+        int maxChunkLimit = 3;
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put(TOKEN_LIMIT_FIELD, 10);
+        parameters.put(TOKENIZER_FIELD, "standard");
+        parameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkLimit);
+        FixedTokenLengthChunker fixedTokenLengthChunker = createFixedTokenLengthChunker(parameters);
+        Map<String, Object> runtimeParameters = new HashMap<>();
+        runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000);
+        String content =
+            "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
+        List<String> passages = fixedTokenLengthChunker.chunk(content, runtimeParameters);
+        List<String> expectedPassages = new ArrayList<>();
+        expectedPassages.add("This is an example document to be chunked. The document ");
+        expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by ");
+        expectedPassages.add("standard tokenizer in OpenSearch.");
+        assertEquals(expectedPassages, passages);
+    }
+
+    public void testChunk_whenExceedRuntimeMaxChunkLimit_thenFail() {
+        int maxChunkLimit = 3, runtimeMaxChunkLimit = 2;
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put(TOKEN_LIMIT_FIELD, 10);
+        parameters.put(TOKENIZER_FIELD, "standard");
+        parameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkLimit);
+        FixedTokenLengthChunker fixedTokenLengthChunker = createFixedTokenLengthChunker(parameters);
+        Map<String, Object> runtimeParameters = new HashMap<>();
+        runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000);
+        runtimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit);
+        String content =
+            "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
+        IllegalStateException illegalStateException = assertThrows(
+            IllegalStateException.class,
+            () -> fixedTokenLengthChunker.chunk(content, runtimeParameters)
+        );
+        assert (illegalStateException.getMessage()
+            .contains(
+                String.format(
+                    Locale.ROOT,
+                    "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].",
+                    TYPE,
+                    maxChunkLimit
+                )
+            ));
+    }
 }