From eec3064a78ac3b84cece120178483ef51b4b41ae Mon Sep 17 00:00:00 2001 From: xinyual Date: Thu, 7 Mar 2024 18:45:49 +0800 Subject: [PATCH 1/6] fix current UT Signed-off-by: xinyual --- .../DocumentChunkingProcessorTests.java | 188 +++++++----------- 1 file changed, 74 insertions(+), 114 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index 332819ae7..4b15c25f7 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -4,11 +4,11 @@ */ package org.opensearch.neuralsearch.processor; -import com.google.common.collect.ImmutableMap; import lombok.SneakyThrows; import org.apache.lucene.tests.analysis.MockTokenizer; import org.junit.Before; import org.mockito.Mock; +import org.opensearch.OpenSearchParseException; import org.opensearch.cluster.ClusterState; import org.opensearch.cluster.metadata.Metadata; import org.opensearch.cluster.service.ClusterService; @@ -38,6 +38,7 @@ import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.when; import static org.mockito.Mockito.mock; +import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD; public class DocumentChunkingProcessorTests extends OpenSearchTestCase { @@ -46,6 +47,8 @@ public class DocumentChunkingProcessorTests extends OpenSearchTestCase { private static final String PROCESSOR_TAG = "mockTag"; private static final String DESCRIPTION = "mockDescription"; private static final String INPUT_FIELD = "body"; + + private static final String INPUT_NESTED_FIELD_KEY = "nested"; private static final String OUTPUT_FIELD = "body_chunk"; private static final String INDEX_NAME = "_index"; @@ -61,11 +64,11 @@ private AnalysisRegistry getAnalysisRegistry() { @Override public Map> getTokenizers() { return singletonMap( - "keyword", - (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory( - name, - () -> new MockTokenizer(MockTokenizer.KEYWORD, false) - ) + "keyword", + (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory( + name, + () -> new MockTokenizer(MockTokenizer.KEYWORD, false) + ) ); } }; @@ -75,6 +78,9 @@ public Map> getTokeniz @Before public void setup() { Metadata metadata = mock(Metadata.class); + Environment environment = mock(Environment.class); + Settings settings = Settings.builder().put("index.mapping.depth.limit", 20).build(); + when(environment.settings()).thenReturn(settings); ClusterState clusterState = mock(ClusterState.class); ClusterService clusterService = mock(ClusterService.class); IndicesService indicesService = mock(IndicesService.class); @@ -96,15 +102,25 @@ private Map createDelimiterParameters() { return parameters; } + private Map createStringFieldMap() { + Map fieldMap = new HashMap<>(); + fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); + return fieldMap; + } + + private Map createNestedFieldMap() { + Map fieldMap = new HashMap<>(); + fieldMap.put(INPUT_NESTED_FIELD_KEY, Map.of(INPUT_FIELD, OUTPUT_FIELD)); + return fieldMap; + } + @SneakyThrows - private DocumentChunkingProcessor createFixedTokenLengthInstance() { + private DocumentChunkingProcessor createFixedTokenLengthInstance(Map fieldMap) { Map config = new HashMap<>(); - Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); 
algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); - fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - config.put(DocumentChunkingProcessor.ALGORITHM_FIELD, algorithmMap); + config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } @@ -117,7 +133,7 @@ private DocumentChunkingProcessor createDelimiterInstance() { algorithmMap.put(ChunkerFactory.DELIMITER_ALGORITHM, createDelimiterParameters()); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - config.put(DocumentChunkingProcessor.ALGORITHM_FIELD, algorithmMap); + config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } @@ -127,11 +143,11 @@ public void testCreate_whenFieldMapEmpty_failure() { Map emptyFieldMap = new HashMap<>(); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, emptyFieldMap); Map registry = new HashMap<>(); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + OpenSearchParseException openSearchParseException = assertThrows( + OpenSearchParseException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); - assertEquals("Unable to create the processor as field_map is null or empty", illegalArgumentException.getMessage()); + assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); } public void testCreate_whenFieldMapWithEmptyParameter_failure() { @@ -140,11 +156,11 @@ public void testCreate_whenFieldMapWithEmptyParameter_failure() { fieldMap.put("key", null); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); Map registry = new HashMap<>(); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + OpenSearchParseException openSearchParseException = assertThrows( + OpenSearchParseException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); - assertEquals("parameters for input field [key] is null, cannot process it.", illegalArgumentException.getMessage()); + assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); } public void testCreate_whenFieldMapWithIllegalParameterType_failure() { @@ -153,28 +169,11 @@ public void testCreate_whenFieldMapWithIllegalParameterType_failure() { fieldMap.put("key", "value"); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); Map registry = new HashMap<>(); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) - ); - assertEquals("parameters for input field [key] cannot be cast to [java.util.Map]", illegalArgumentException.getMessage()); - } - - public void testCreate_whenFieldMapWithIllegalKey_failure() { - Map config = new HashMap<>(); - Map fieldMap = new HashMap<>(); - fieldMap.put(null, 1); - fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); - Map algorithmMap = new HashMap<>(); - algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, 
fieldMap); - config.put(DocumentChunkingProcessor.ALGORITHM_FIELD, algorithmMap); - Map registry = new HashMap<>(); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + OpenSearchParseException openSearchParseException = assertThrows( + OpenSearchParseException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); - assertEquals("found parameter entry with non-string key", illegalArgumentException.getMessage()); + assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); } public void testCreate_whenFieldMapWithNoAlgorithm_failure() { @@ -183,21 +182,21 @@ public void testCreate_whenFieldMapWithNoAlgorithm_failure() { Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - config.put(DocumentChunkingProcessor.ALGORITHM_FIELD, algorithmMap); + config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + IllegalArgumentException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( - "input field [" + INPUT_FIELD + "] should has and only has 1 chunking algorithm", - illegalArgumentException.getMessage() + "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", + illegalArgumentException.getMessage() ); } @SneakyThrows public void testGetType() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); String type = processor.getType(); assertEquals(DocumentChunkingProcessor.TYPE, type); } @@ -209,10 +208,10 @@ private String createSourceDataString() { private List createSourceDataList() { List documents = new ArrayList<>(); documents.add( - "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); documents.add( - "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); return documents; } @@ -220,27 +219,29 @@ private List createSourceDataList() { private Map createSourceDataMap() { Map documents = new HashMap<>(); documents.put( - "third", - "This is the third document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "third", + "This is the third document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); documents.put( - "fourth", - "This is the fourth document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "fourth", + "This is the fourth document to be chunked. 
The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); return documents; } private Map createSourceDataNestedMap() { - String documentString = createSourceDataString(); - List documentList = createSourceDataList(); - Map documentMap = createSourceDataMap(); Map documents = new HashMap<>(); - documents.put("String", documentString); - documents.put("List", documentList); - documents.put("Map", documentMap); + documents.put(INPUT_FIELD, createSourceDataString()); return documents; } + private IngestDocument createIngestDocumentWithNestedSourceData(Object sourceData) { + Map sourceAndMetadata = new HashMap<>(); + sourceAndMetadata.put(INPUT_NESTED_FIELD_KEY, sourceData); + sourceAndMetadata.put(IndexFieldMapper.NAME, INDEX_NAME); + return new IngestDocument(sourceAndMetadata, new HashMap<>()); + } + private IngestDocument createIngestDocumentWithSourceData(Object sourceData) { Map sourceAndMetadata = new HashMap<>(); sourceAndMetadata.put(INPUT_FIELD, sourceData); @@ -250,7 +251,7 @@ private IngestDocument createIngestDocumentWithSourceData(Object sourceData) { @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataString_successful() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); @@ -265,7 +266,7 @@ public void testExecute_withFixedTokenLength_andSourceDataString_successful() { @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataList_successful() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataList()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); @@ -283,63 +284,22 @@ public void testExecute_withFixedTokenLength_andSourceDataList_successful() { } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataMap_successful() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); - IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataMap()); + public void testExecute_withFixedTokenLength_andFieldMapNestedMap_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); + IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMap()); IngestDocument document = processor.execute(ingestDocument); - assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); - Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); - assert (passages instanceof Map); - - List expectedPassages1 = new ArrayList<>(); - List expectedPassages2 = new ArrayList<>(); - - expectedPassages1.add("This is the third document to be chunked The document"); - expectedPassages1.add("The document contains a single paragraph two sentences and 24"); - expectedPassages1.add("and 24 tokens by standard tokenizer in OpenSearch"); - expectedPassages2.add("This is the fourth document to be chunked The document"); - expectedPassages2.add("The document contains 
a single paragraph two sentences and 24");
-        expectedPassages2.add("and 24 tokens by standard tokenizer in OpenSearch");
+        assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY);
+        Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY);
+        assert (nestedResult instanceof Map);
+        assert ((Map) nestedResult).containsKey(OUTPUT_FIELD);
+        Object passages = ((Map) nestedResult).get(OUTPUT_FIELD);
+        assert (passages instanceof List);
-        Map expectedPassages = ImmutableMap.of("third", expectedPassages1, "fourth", expectedPassages2);
-
-        assertEquals(expectedPassages, passages);
-    }
+        List expectedPassages = new ArrayList<>();
-    @SneakyThrows
-    public void testExecute_withFixedTokenLength_andSourceDataNestedMap_successful() {
-        DocumentChunkingProcessor processor = createFixedTokenLengthInstance();
-        IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataNestedMap());
-        IngestDocument document = processor.execute(ingestDocument);
-        assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
-        Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD);
-        assert (passages instanceof Map);
-
-        Map expectedPassages = new HashMap<>();
-        List expectedPassages1 = new ArrayList<>();
-        List expectedPassages2 = new ArrayList<>();
-        List expectedPassages3 = new ArrayList<>();
-        List expectedPassages4 = new ArrayList<>();
-
-        expectedPassages1.add("This is an example document to be chunked The document");
-        expectedPassages1.add("The document contains a single paragraph two sentences and 24");
-        expectedPassages1.add("and 24 tokens by standard tokenizer in OpenSearch");
-        expectedPassages2.add("This is the first document to be chunked The document");
-        expectedPassages2.add("The document contains a single paragraph two sentences and 24");
-        expectedPassages2.add("and 24 tokens by standard tokenizer in OpenSearch");
-        expectedPassages2.add("This is the second document to be chunked The document");
-        expectedPassages2.add("The document contains a single paragraph two sentences and 24");
-        expectedPassages2.add("and 24 tokens by standard tokenizer in OpenSearch");
-        expectedPassages3.add("This is the third document to be chunked The document");
-        expectedPassages3.add("The document contains a single paragraph two sentences and 24");
-        expectedPassages3.add("and 24 tokens by standard tokenizer in OpenSearch");
-        expectedPassages4.add("This is the fourth document to be chunked The document");
-        expectedPassages4.add("The document contains a single paragraph two sentences and 24");
-        expectedPassages4.add("and 24 tokens by standard tokenizer in OpenSearch");
-
-        expectedPassages.put("String", expectedPassages1);
-        expectedPassages.put("List", expectedPassages2);
-        expectedPassages.put("Map", ImmutableMap.of("third", expectedPassages3, "fourth", expectedPassages4));
+        expectedPassages.add("This is an example document to be chunked The document");
+        expectedPassages.add("The document contains a single paragraph two sentences and 24");
+        expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch");
         assertEquals(expectedPassages, passages);
     }

From 11984f24759ef29108226a26b42c0e15ed868e19 Mon Sep 17 00:00:00 2001
From: xinyual
Date: Thu, 7 Mar 2024 18:46:23 +0800
Subject: [PATCH 2/6] change delimiter UT

Signed-off-by: xinyual
---
 .../chunker/DelimiterChunkerTests.java | 33 -------------------
 1 file changed, 33 deletions(-)
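These deletions go together with PATCH 3/6 and 4/6, which move max_chunk_limit handling out of DelimiterChunker and into DocumentChunkingProcessor. A minimal sketch of where the limit is configured after this series, with names taken from the test changes in PATCH 4/6 (illustrative only, not the final API):

    // Sketch: the chunk limit is now part of the algorithm parameters read by the processor,
    // e.g. for the fixed_token_length algorithm, rather than validated inside DelimiterChunker.
    Map<String, Object> parameters = new HashMap<>();
    parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10);
    parameters.put(DocumentChunkingProcessor.MAX_CHUNK_LIMIT_FIELD, 5); // enforced by the processor
    Map<String, Object> algorithmMap = Map.of(ChunkerFactory.FIXED_LENGTH_ALGORITHM, parameters);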
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java
index b92999fa3..a1fa4185c 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java
@@ -13,34 +13,9 @@
 import static junit.framework.TestCase.assertEquals;
 import static org.junit.Assert.assertThrows;
 import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD;
-import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.MAX_CHUNK_LIMIT_FIELD;
 
 public class DelimiterChunkerTests extends OpenSearchTestCase {
-    public void testChunkerWithWrongLimitFieldList() {
-        DelimiterChunker chunker = new DelimiterChunker();
-        String content = "a\nb\nc\nd";
-        Map inputParameters = Map.of(MAX_CHUNK_LIMIT_FIELD, List.of("-1"), DELIMITER_FIELD, "\n");
-        Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters));
-        Assert.assertEquals("Parameter max_chunk_limit:" + List.of("-1") + " should be integer.", exception.getMessage());
-    }
-
-    public void testChunkerWithWrongLimitField() {
-        DelimiterChunker chunker = new DelimiterChunker();
-        String content = "a\nb\nc\nd";
-        Map inputParameters = Map.of(MAX_CHUNK_LIMIT_FIELD, "1000\n", DELIMITER_FIELD, "\n");
-        Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters));
-        Assert.assertEquals("Parameter max_chunk_limit:1000\n should be integer.", exception.getMessage());
-    }
-
-    public void testChunkerWithNegativeLimit() {
-        DelimiterChunker chunker = new DelimiterChunker();
-        String content = "a\nb\nc\nd";
-        Map inputParameters = Map.of(MAX_CHUNK_LIMIT_FIELD, -1, DELIMITER_FIELD, "\n");
-        Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters));
-        Assert.assertEquals("Parameter max_chunk_limit:-1 is not greater than 0.", exception.getMessage());
-    }
-
     public void testChunkerWithDelimiterFieldNotString() {
         DelimiterChunker chunker = new DelimiterChunker();
         String content = "a\nb\nc\nd";
@@ -57,14 +32,6 @@ public void testChunkerWithDelimiterFieldNoString() {
         Assert.assertEquals("delimiter parameters should not be empty.", exception.getMessage());
     }
 
-    public void testChunkerWithLimitNumber() {
-        DelimiterChunker chunker = new DelimiterChunker();
-        String content = "a\nb\nc\nd";
-        Map inputParameters = Map.of(DELIMITER_FIELD, "\n", MAX_CHUNK_LIMIT_FIELD, 1);
-        IllegalStateException exception = assertThrows(IllegalStateException.class, () -> chunker.chunk(content, inputParameters));
-        Assert.assertEquals("Exceed max chunk number: 1", exception.getMessage());
-    }
-
     public void testChunker() {
         DelimiterChunker chunker = new DelimiterChunker();
         String content = "a\nb\nc\nd";

From 8b03962677b3e67f6b9cf05d967b14b0721d97db Mon Sep 17 00:00:00 2001
From: xinyual
Date: Thu, 7 Mar 2024 18:47:07 +0800
Subject: [PATCH 3/6] remove delimiter useless code

Signed-off-by: xinyual
---
 .../processor/chunker/DelimiterChunker.java | 24 ++-----------------
 1 file changed, 2 insertions(+), 22 deletions(-)
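With the limit logic removed below, DelimiterChunker.chunk only splits on the configured delimiter and keeps the delimiter at the end of each chunk. A small usage sketch based on the method as it stands after this patch (illustrative only; return and parameter generics are assumed):

    // Splitting on "\n": the delimiter stays attached to each chunk, and the final
    // segment is emitted as-is when no trailing delimiter is present.
    DelimiterChunker chunker = new DelimiterChunker();
    Map<String, Object> parameters = Map.of(DelimiterChunker.DELIMITER_FIELD, "\n");
    List<String> chunks = chunker.chunk("a\nb\nc\nd", parameters);
    // chunks -> ["a\n", "b\n", "c\n", "d"]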
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
index e99849b53..64dffd1cd 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
@@ -14,10 +14,6 @@ public DelimiterChunker() {}
 
     public static String DELIMITER_FIELD = "delimiter";
 
-    public static String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit";
-
-    private static final int DEFAULT_MAX_CHUNK_LIMIT = 100;
-
     @Override
     public void validateParameters(Map parameters) {
         if (parameters.containsKey(DELIMITER_FIELD)) {
@@ -28,41 +24,25 @@ public void validateParameters(Map parameters) {
                 throw new IllegalArgumentException("delimiter parameters should not be empty.");
             }
         }
-        if (parameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) {
-            Object maxChunkLimit = parameters.get(MAX_CHUNK_LIMIT_FIELD);
-            if (!(maxChunkLimit instanceof Integer)) {
-                throw new IllegalArgumentException("Parameter max_chunk_limit:" + maxChunkLimit.toString() + " should be integer.");
-            } else if ((int) maxChunkLimit < 0) {
-                throw new IllegalArgumentException("Parameter max_chunk_limit:" + maxChunkLimit + " is negative.");
-            }
-        }
     }
 
     @Override
     public List chunk(String content, Map parameters) {
         String delimiter = (String) parameters.getOrDefault(DELIMITER_FIELD, ".");
-        int maxChunkingNumber = (int) parameters.getOrDefault(MAX_CHUNK_LIMIT_FIELD, -1);
         List chunkResult = new ArrayList<>();
         int start = 0;
         int end = content.indexOf(delimiter);
 
         while (end != -1) {
-            addChunkResult(chunkResult, maxChunkingNumber, content.substring(start, end + delimiter.length()));
+            chunkResult.add(content.substring(start, end + delimiter.length()));
             start = end + delimiter.length();
             end = content.indexOf(delimiter, start);
         }
 
         if (start < content.length()) {
-            addChunkResult(chunkResult, maxChunkingNumber, content.substring(start));
+            chunkResult.add(content.substring(start));
         }
 
         return chunkResult;
     }
-
-    private void addChunkResult(List chunkResult, int maxChunkingNumber, String candidate) {
-        if (chunkResult.size() >= maxChunkingNumber && maxChunkingNumber > 0) {
-            throw new IllegalStateException("Exceed max chunk number: " + maxChunkingNumber);
-        }
-        chunkResult.add(candidate);
-    }
 }

From 9de6ea37d2dbb4f4f235c20e0676a3a2e2472f55 Mon Sep 17 00:00:00 2001
From: xinyual
Date: Thu, 7 Mar 2024 19:24:35 +0800
Subject: [PATCH 4/6] add more UT

Signed-off-by: xinyual
---
 .../processor/DocumentChunkingProcessor.java | 2 +-
 .../DocumentChunkingProcessorTests.java | 142 ++++++++++++++----
 2 files changed, 114 insertions(+), 30 deletions(-)

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
index 24c95e3d0..3ce01cdcb 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
@@ -152,7 +152,7 @@ private List chunkString(String content) {
                     "Unable to create the processor as the number of chunks ["
                         + current_chunk_count
                         + "] exceeds the maximum chunk limit ["
-                        + MAX_CHUNK_LIMIT_FIELD
+                        + max_chunk_limit
                         + "]"
                 );
             }
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java
index 4b15c25f7..dc0337152 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java
@@ -39,6
+39,7 @@ import static org.mockito.Mockito.when; import static org.mockito.Mockito.mock; import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD; +import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.MAX_CHUNK_LIMIT_FIELD; public class DocumentChunkingProcessorTests extends OpenSearchTestCase { @@ -64,11 +65,11 @@ private AnalysisRegistry getAnalysisRegistry() { @Override public Map> getTokenizers() { return singletonMap( - "keyword", - (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory( - name, - () -> new MockTokenizer(MockTokenizer.KEYWORD, false) - ) + "keyword", + (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory( + name, + () -> new MockTokenizer(MockTokenizer.KEYWORD, false) + ) ); } }; @@ -96,6 +97,13 @@ private Map createFixedTokenLengthParameters() { return parameters; } + private Map createFixedTokenLengthParametersWithMaxChunk(int maxChunkNum) { + Map parameters = new HashMap<>(); + parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10); + parameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkNum); + return parameters; + } + private Map createDelimiterParameters() { Map parameters = new HashMap<>(); parameters.put(DelimiterChunker.DELIMITER_FIELD, "."); @@ -125,6 +133,17 @@ private DocumentChunkingProcessor createFixedTokenLengthInstance(Map fieldMap, int maxChunkNum) { + Map config = new HashMap<>(); + Map algorithmMap = new HashMap<>(); + algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(maxChunkNum)); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(ALGORITHM_FIELD, algorithmMap); + Map registry = new HashMap<>(); + return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); + } + @SneakyThrows private DocumentChunkingProcessor createDelimiterInstance() { Map config = new HashMap<>(); @@ -144,12 +163,30 @@ public void testCreate_whenFieldMapEmpty_failure() { config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, emptyFieldMap); Map registry = new HashMap<>(); OpenSearchParseException openSearchParseException = assertThrows( - OpenSearchParseException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + OpenSearchParseException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); } + @SneakyThrows + public void testCreate_whenMaxChunkNumNegative() { + Map registry = new HashMap<>(); + Map config = new HashMap<>(); + Map fieldMap = new HashMap<>(); + Map algorithmMap = new HashMap<>(); + fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); + algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(-1)); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(ALGORITHM_FIELD, algorithmMap); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + ); + assertEquals("Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] must be a positive integer", illegalArgumentException.getMessage()); + + } + public void testCreate_whenFieldMapWithEmptyParameter_failure() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); @@ -157,8 +194,8 @@ public void testCreate_whenFieldMapWithEmptyParameter_failure() { config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, 
fieldMap); Map registry = new HashMap<>(); OpenSearchParseException openSearchParseException = assertThrows( - OpenSearchParseException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + OpenSearchParseException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); } @@ -170,8 +207,8 @@ public void testCreate_whenFieldMapWithIllegalParameterType_failure() { config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); Map registry = new HashMap<>(); OpenSearchParseException openSearchParseException = assertThrows( - OpenSearchParseException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + OpenSearchParseException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); } @@ -185,12 +222,12 @@ public void testCreate_whenFieldMapWithNoAlgorithm_failure() { config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + IllegalArgumentException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( - "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", - illegalArgumentException.getMessage() + "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", + illegalArgumentException.getMessage() ); } @@ -208,10 +245,10 @@ private String createSourceDataString() { private List createSourceDataList() { List documents = new ArrayList<>(); documents.add( - "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); documents.add( - "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); return documents; } @@ -219,12 +256,12 @@ private List createSourceDataList() { private Map createSourceDataMap() { Map documents = new HashMap<>(); documents.put( - "third", - "This is the third document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "third", + "This is the third document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); documents.put( - "fourth", - "This is the fourth document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "fourth", + "This is the fourth document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
); return documents; } @@ -249,6 +286,53 @@ private IngestDocument createIngestDocumentWithSourceData(Object sourceData) { return new IngestDocument(sourceAndMetadata, new HashMap<>()); } + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 5); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); + assertEquals(expectedPassages, passages); + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumTwice_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 5); + for (int i = 0; i < 2; i++) { + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); + assertEquals(expectedPassages, passages); + } + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_Exceed() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 1); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> processor.execute(ingestDocument) + ); + assertEquals( + illegalArgumentException.getMessage(), + "Unable to create the processor as the number of chunks [" + "3" + "] exceeds the maximum chunk limit [" + "1" + "]" + ); + + } + @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataString_successful() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); @@ -259,8 +343,8 @@ public void testExecute_withFixedTokenLength_andSourceDataString_successful() { assert (passages instanceof List); List expectedPassages = new ArrayList<>(); expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("The document contains a single paragraph two sentences and 24"); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -275,11 +359,11 @@ public void 
testExecute_withFixedTokenLength_andSourceDataList_successful() { List expectedPassages = new ArrayList<>(); expectedPassages.add("This is the first document to be chunked The document"); - expectedPassages.add("The document contains a single paragraph two sentences and 24"); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); expectedPassages.add("This is the second document to be chunked The document"); - expectedPassages.add("The document contains a single paragraph two sentences and 24"); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -298,8 +382,8 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_successful() { List expectedPassages = new ArrayList<>(); expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("The document contains a single paragraph two sentences and 24"); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } From 36d6816266e4ced939ae227fbf11e0e02a027e2c Mon Sep 17 00:00:00 2001 From: xinyual Date: Thu, 7 Mar 2024 19:25:50 +0800 Subject: [PATCH 5/6] add UT for list inside map Signed-off-by: xinyual --- .../DocumentChunkingProcessorTests.java | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index dc0337152..3b2e7eba4 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -388,6 +388,29 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_successful() { assertEquals(expectedPassages, passages); } + @SneakyThrows + public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); + IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataListNestedMap()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY); + Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY); + List expectedPassages = new ArrayList<>(); + + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("The document contains a single paragraph two sentences and 24"); + expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + assert (nestedResult instanceof List); + assertEquals(((List) nestedResult).size(), 2); + for (Object result : (List) nestedResult) { + assert (result instanceof Map); + assert ((Map) result).containsKey(OUTPUT_FIELD); + Object passages = ((Map) result).get(OUTPUT_FIELD); + assert (passages instanceof List); + assertEquals(expectedPassages, 
passages); + } + } + @SneakyThrows public void testExecute_withDelimiter_andSourceDataString_successful() { DocumentChunkingProcessor processor = createDelimiterInstance(); From b2e8f36cfd7f1417aa33bd4711ccfc2e6240cb3d Mon Sep 17 00:00:00 2001 From: xinyual Date: Thu, 7 Mar 2024 19:27:38 +0800 Subject: [PATCH 6/6] add UT for list inside map Signed-off-by: xinyual --- .../processor/DocumentChunkingProcessorTests.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index 3b2e7eba4..ae4ff17de 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -97,6 +97,12 @@ private Map createFixedTokenLengthParameters() { return parameters; } + private List> createSourceDataListNestedMap() { + Map documents = new HashMap<>(); + documents.put(INPUT_FIELD, createSourceDataString()); + return List.of(documents, documents); + } + private Map createFixedTokenLengthParametersWithMaxChunk(int maxChunkNum) { Map parameters = new HashMap<>(); parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10); @@ -398,8 +404,8 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_suc List expectedPassages = new ArrayList<>(); expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("The document contains a single paragraph two sentences and 24"); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); assert (nestedResult instanceof List); assertEquals(((List) nestedResult).size(), 2); for (Object result : (List) nestedResult) {