diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
index 80fcf90f4..8f60a6ff8 100644
--- a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
+++ b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
@@ -35,6 +35,7 @@ import org.opensearch.neuralsearch.processor.TextImageEmbeddingProcessor;
 import org.opensearch.neuralsearch.processor.combination.ScoreCombinationFactory;
 import org.opensearch.neuralsearch.processor.combination.ScoreCombiner;
+import org.opensearch.neuralsearch.processor.factory.DocumentChunkingProcessorFactory;
 import org.opensearch.neuralsearch.processor.factory.NormalizationProcessorFactory;
 import org.opensearch.neuralsearch.processor.factory.RerankProcessorFactory;
 import org.opensearch.neuralsearch.processor.factory.SparseEncodingProcessorFactory;
@@ -117,7 +118,7 @@ public Map getProcessors(Processor.Parameters paramet
             TextImageEmbeddingProcessor.TYPE,
             new TextImageEmbeddingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()),
             DocumentChunkingProcessor.TYPE,
-            new DocumentChunkingProcessor.Factory(
+            new DocumentChunkingProcessorFactory(
                 parameters.env,
                 parameters.ingestService.getClusterService(),
                 parameters.indicesService,
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
index 81d7d2b17..0f8b70b30 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
@@ -21,12 +21,10 @@ import org.opensearch.index.IndexSettings;
 import org.opensearch.ingest.AbstractProcessor;
 import org.opensearch.ingest.IngestDocument;
-import org.opensearch.ingest.Processor;
 import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
 import org.opensearch.neuralsearch.processor.chunker.FieldChunker;
 import org.opensearch.index.mapper.IndexFieldMapper;
 import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
-import static org.opensearch.ingest.ConfigurationUtils.readMap;
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.DELIMITER_ALGORITHM;
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_LENGTH_ALGORITHM;
@@ -47,9 +45,9 @@ public final class DocumentChunkingProcessor extends AbstractProcessor {
     private static final int DEFAULT_MAX_CHUNK_LIMIT = -1;
 
-    private int current_chunk_count = 0;
+    private int currentChunkCount = 0;
 
-    private int max_chunk_limit = DEFAULT_MAX_CHUNK_LIMIT;
+    private int maxChunkLimit = DEFAULT_MAX_CHUNK_LIMIT;
 
     private final Set supportedChunkers = ChunkerFactory.getAllChunkers();
 
     private String chunkerType;
@@ -122,11 +120,17 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) {
             this.chunkerParameters = (Map) algorithmValue;
             chunker.validateParameters(chunkerParameters);
             if (((Map) algorithmValue).containsKey(MAX_CHUNK_LIMIT_FIELD)) {
-                int max_chunk_limit = ((Number) ((Map) algorithmValue).get(MAX_CHUNK_LIMIT_FIELD)).intValue();
-                if (max_chunk_limit <= 0) {
+                Object maxChunkLimitObject = ((Map) algorithmValue).get(MAX_CHUNK_LIMIT_FIELD);
+                if (!(maxChunkLimitObject instanceof Number)) {
+                    throw new IllegalArgumentException(
+                        "Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]"
+                    );
+                }
+                int maxChunkLimit = ((Number) maxChunkLimitObject).intValue();
+                if (maxChunkLimit <= 0 && maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT) {
                     throw new IllegalArgumentException("Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] must be a positive integer");
                 }
-                this.max_chunk_limit = max_chunk_limit;
+                this.maxChunkLimit = maxChunkLimit;
             }
         }
     }
@@ -148,13 +152,13 @@ private boolean isListString(Object value) {
     private List chunkString(String content) {
         FieldChunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry);
         List result = chunker.chunk(content, chunkerParameters);
-        current_chunk_count += result.size();
-        if (max_chunk_limit != DEFAULT_MAX_CHUNK_LIMIT && current_chunk_count > max_chunk_limit) {
+        currentChunkCount += result.size();
+        if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && currentChunkCount > maxChunkLimit) {
             throw new IllegalArgumentException(
                 "Unable to create the processor as the number of chunks ["
-                    + current_chunk_count
+                    + currentChunkCount
                     + "] exceeds the maximum chunk limit ["
-                    + max_chunk_limit
+                    + maxChunkLimit
                     + "]"
             );
         }
@@ -189,7 +193,7 @@ private List chunkLeafType(Object value) {
     @Override
     public IngestDocument execute(IngestDocument ingestDocument) {
         validateFieldsValue(ingestDocument);
-        current_chunk_count = 0;
+        currentChunkCount = 0;
         if (Objects.equals(chunkerType, FIXED_LENGTH_ALGORITHM)) {
             // add maxTokenCount setting from index metadata to chunker parameters
             Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
@@ -283,51 +287,4 @@ private void chunkMapType(Map sourceAndMetadataMap, Map
-            Map registry,
-            String processorTag,
-            String description,
-            Map config
-        ) throws Exception {
-            Map fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD);
-            Map algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD);
-            return new DocumentChunkingProcessor(
-                processorTag,
-                description,
-                fieldMap,
-                algorithmMap,
-                environment,
-                clusterService,
-                indicesService,
-                analysisRegistry
-            );
-        }
-    }
 }
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
index 787e94cd7..8fa8ec088 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
@@ -59,6 +59,5 @@ public List chunk(String content, Map parameters) {
             chunkResult.add(content.substring(start));
         }
         return chunkResult;
-
     }
 }
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
index fe08ee074..b58a9c157 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
@@ -122,7 +122,7 @@ public void validateParameters(Map parameters) {
     public List chunk(String content, Map parameters) {
         // prior to chunking, parameters have been validated
         int tokenLimit = DEFAULT_TOKEN_LIMIT;
-        BigDecimal overlap_rate = new BigDecimal(String.valueOf(DEFAULT_OVERLAP_RATE));
+        BigDecimal overlap_rate = DEFAULT_OVERLAP_RATE;
         int maxTokenCount = DEFAULT_MAX_TOKEN_COUNT;
         String tokenizer = DEFAULT_TOKENIZER;
@@ -148,7 +148,6 @@ public List chunk(String content, Map parameters) {
         BigDecimal overlapTokenNumberBigDecimal = overlap_rate.multiply(new BigDecimal(String.valueOf(tokenLimit)))
             .setScale(0, RoundingMode.DOWN);
         int overlapTokenNumber = overlapTokenNumberBigDecimal.intValue();
-        ;
         // overlapTokenNumber must be smaller than the token limit
         overlapTokenNumber = Math.min(overlapTokenNumber, tokenLimit - 1);
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactory.java
new file mode 100644
index 000000000..9fa38b48a
--- /dev/null
+++ b/src/main/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactory.java
@@ -0,0 +1,66 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.processor.factory;
+
+import java.util.Map;
+
+import org.opensearch.cluster.service.ClusterService;
+import org.opensearch.env.Environment;
+import org.opensearch.index.analysis.AnalysisRegistry;
+import org.opensearch.indices.IndicesService;
+import org.opensearch.ingest.Processor;
+import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor;
+import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.TYPE;
+import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.FIELD_MAP_FIELD;
+import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD;
+import static org.opensearch.ingest.ConfigurationUtils.readMap;
+
+/**
+ * Factory for chunking ingest processor for ingestion pipeline.
+ * Instantiates processor based on user provided input.
+ */
+public class DocumentChunkingProcessorFactory implements Processor.Factory {
+
+    private final Environment environment;
+
+    private final ClusterService clusterService;
+
+    private final IndicesService indicesService;
+
+    private final AnalysisRegistry analysisRegistry;
+
+    public DocumentChunkingProcessorFactory(
+        Environment environment,
+        ClusterService clusterService,
+        IndicesService indicesService,
+        AnalysisRegistry analysisRegistry
+    ) {
+        this.environment = environment;
+        this.clusterService = clusterService;
+        this.indicesService = indicesService;
+        this.analysisRegistry = analysisRegistry;
+    }
+
+    @Override
+    public DocumentChunkingProcessor create(
+        Map registry,
+        String processorTag,
+        String description,
+        Map config
+    ) throws Exception {
+        Map fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD);
+        Map algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD);
+        return new DocumentChunkingProcessor(
+            processorTag,
+            description,
+            fieldMap,
+            algorithmMap,
+            environment,
+            clusterService,
+            indicesService,
+            analysisRegistry
+        );
+    }
+}
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java
index c02c76256..b444e19b7 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java
@@ -34,6 +34,7 @@ import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
 import org.opensearch.neuralsearch.processor.chunker.DelimiterChunker;
 import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
+import org.opensearch.neuralsearch.processor.factory.DocumentChunkingProcessorFactory;
 import org.opensearch.plugins.AnalysisPlugin;
 import org.opensearch.test.OpenSearchTestCase;
 import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.FIELD_MAP_FIELD;
@@ -42,12 +43,11 @@ public class DocumentChunkingProcessorTests extends OpenSearchTestCase {
-    private DocumentChunkingProcessor.Factory factory;
+    private DocumentChunkingProcessorFactory documentChunkingProcessorFactory;
 
     private static final String PROCESSOR_TAG = "mockTag";
     private static final String DESCRIPTION = "mockDescription";
     private static final String INPUT_FIELD = "body";
-    private static final String INPUT_NESTED_FIELD_KEY = "nested";
     private static final String OUTPUT_FIELD = "body_chunk";
     private static final String INDEX_NAME = "_index";
@@ -84,7 +84,12 @@ public void setup() {
         when(metadata.index(anyString())).thenReturn(null);
         when(clusterState.metadata()).thenReturn(metadata);
         when(clusterService.state()).thenReturn(clusterState);
-        factory = new DocumentChunkingProcessor.Factory(environment, clusterService, indicesService, getAnalysisRegistry());
+        documentChunkingProcessorFactory = new DocumentChunkingProcessorFactory(
+            environment,
+            clusterService,
+            indicesService,
+            getAnalysisRegistry()
+        );
     }
@@ -132,7 +137,7 @@ private DocumentChunkingProcessor createFixedTokenLengthInstance(Map
         Map registry = new HashMap<>();
-        return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
+        return documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
     }
 
     @SneakyThrows
@@ -143,7 +148,7 @@ private DocumentChunkingProcessor createFixedTokenLengthInstanceWithMaxChunkNum(
         config.put(FIELD_MAP_FIELD, fieldMap);
         config.put(ALGORITHM_FIELD, algorithmMap);
         Map registry = new HashMap<>();
-        return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
+        return documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
     }
 
     @SneakyThrows
@@ -156,40 +161,40 @@ private DocumentChunkingProcessor createDelimiterInstance() {
         config.put(FIELD_MAP_FIELD, fieldMap);
         config.put(ALGORITHM_FIELD, algorithmMap);
         Map registry = new HashMap<>();
-        return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
+        return documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
     }
 
-    public void testCreate_whenAlgorithmFieldMissing_failure() {
+    public void testCreate_whenAlgorithmFieldMissing_thenFail() {
         Map config = new HashMap<>();
         Map fieldMap = new HashMap<>();
         config.put(FIELD_MAP_FIELD, fieldMap);
         Map registry = new HashMap<>();
         OpenSearchParseException openSearchParseException = assertThrows(
             OpenSearchParseException.class,
-            () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
+            () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
         );
         assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage());
     }
 
     @SneakyThrows
-    public void testCreate_whenMaxChunkNumNegative() {
+    public void testCreate_whenMaxChunkNumInvalidValue_thenFail() {
         Map registry = new HashMap<>();
         Map config = new HashMap<>();
         Map fieldMap = new HashMap<>();
         Map algorithmMap = new HashMap<>();
         fieldMap.put(INPUT_FIELD, OUTPUT_FIELD);
-        algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(-1));
+        algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(-2));
         config.put(FIELD_MAP_FIELD, fieldMap);
         config.put(ALGORITHM_FIELD, algorithmMap);
         IllegalArgumentException illegalArgumentException = assertThrows(
             IllegalArgumentException.class,
-            () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
+            () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
         );
         assertEquals("Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] must be a positive integer", illegalArgumentException.getMessage());
     }
 
-    public void testCreate_whenAlgorithmFieldNoAlgorithm_failure() {
+    public void testCreate_whenAlgorithmFieldNoAlgorithm_thenFail() {
         Map config = new HashMap<>();
         Map fieldMap = new HashMap<>();
         Map algorithmMap = new HashMap<>();
@@ -199,7 +204,7 @@ public void testCreate_whenAlgorithmFieldNoAlgorithm_failure() {
         Map registry = new HashMap<>();
         IllegalArgumentException illegalArgumentException = assertThrows(
             IllegalArgumentException.class,
-            () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
+            () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
         );
         assertEquals(
             "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm",
@@ -207,7 +212,7 @@ public void testCreate_whenAlgorithmFieldNoAlgorithm_failure() {
         );
     }
 
-    public void testCreate_whenAlgorithmFieldMultipleAlgorithm_failure() {
+    public void testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() {
         Map config = new HashMap<>();
         Map fieldMap = new HashMap<>();
         Map algorithmMap = new HashMap<>();
@@ -219,7 +224,7 @@ public void testCreate_whenAlgorithmFieldMultipleAlgorithm_failure() {
         Map registry = new HashMap<>();
         IllegalArgumentException illegalArgumentException = assertThrows(
             IllegalArgumentException.class,
-            () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
+            () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
        );
         assertEquals(
             "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm",
@@ -227,7 +232,7 @@ public void testCreate_whenAlgorithmFieldMultipleAlgorithm_failure() {
         );
     }
 
-    public void testCreate_whenAlgorithmFieldInvalidAlgorithmName_failure() {
+    public void testCreate_whenAlgorithmFieldInvalidAlgorithmName_thenFail() {
         Map config = new HashMap<>();
         Map fieldMap = new HashMap<>();
         Map algorithmMap = new HashMap<>();
@@ -239,13 +244,13 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmName_failure() {
         Map registry = new HashMap<>();
         IllegalArgumentException illegalArgumentException = assertThrows(
             IllegalArgumentException.class,
-            () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
+            () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
         );
         assert (illegalArgumentException.getMessage()
             .contains("Unable to create the processor as chunker algorithm [" + invalid_algorithm_type + "] is not supported"));
     }
 
-    public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_failure() {
+    public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_thenFail() {
         Map config = new HashMap<>();
         Map fieldMap = new HashMap<>();
         Map algorithmMap = new HashMap<>();
@@ -256,7 +261,7 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_failure() {
         Map registry = new HashMap<>();
         IllegalArgumentException illegalArgumentException = assertThrows(
             IllegalArgumentException.class,
-            () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
+            () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
         );
         assertEquals(
             "Unable to create the processor as ["
@@ -347,7 +352,7 @@ private IngestDocument createIngestDocumentWithSourceData(Object sourceData) {
     }
 
     @SneakyThrows
-    public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_successful() {
+    public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_thenSucceed() {
         DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 5);
         IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
         IngestDocument document = processor.execute(ingestDocument);
@@ -362,7 +367,7 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_
     }
 
     @SneakyThrows
-    public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumTwice_successful() {
+    public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumTwice_thenSucceed() {
         DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 5);
         for (int i = 0; i < 2; i++) {
             IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
@@ -379,7 +384,7 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumT
     }
 
     @SneakyThrows
-    public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_Exceed() {
+    public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumExceed_thenFail() {
         DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 1);
         IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
         IllegalArgumentException illegalArgumentException = assertThrows(
@@ -394,7 +399,7 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_
     }
 
     @SneakyThrows
-    public void testExecute_withFixedTokenLength_andSourceDataString_successful() {
+    public void testExecute_withFixedTokenLength_andSourceDataString_thenSucceed() {
         DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap());
         IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
         IngestDocument document = processor.execute(ingestDocument);
@@ -409,7 +414,7 @@ public void testExecute_withFixedTokenLength_andSourceDataString_successful() {
     }
 
     @SneakyThrows
-    public void testExecute_withFixedTokenLength_andSourceDataInvalidType_failure() {
+    public void testExecute_withFixedTokenLength_andSourceDataInvalidType_thenFail() {
         DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap());
         Map sourceAndMetadata = new HashMap<>();
         sourceAndMetadata.put(INPUT_FIELD, 1);
@@ -426,7 +431,7 @@ public void testExecute_withFixedTokenLength_andSourceDataInvalidType_failure()
     }
 
     @SneakyThrows
-    public void testExecute_withFixedTokenLength_andSourceDataListStrings_successful() {
+    public void testExecute_withFixedTokenLength_andSourceDataListStrings_thenSucceed() {
         DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap());
         IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListStrings());
         IngestDocument document = processor.execute(ingestDocument);
@@ -445,7 +450,7 @@ public void testExecute_withFixedTokenLength_andSourceDataListStrings_successful
     }
 
     @SneakyThrows
-    public void testExecute_withFixedTokenLength_andSourceDataListHybridType_failure() {
+    public void testExecute_withFixedTokenLength_andSourceDataListHybridType_thenFail() {
         DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap());
         IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListHybridType());
         IllegalArgumentException illegalArgumentException = assertThrows(
@@ -459,7 +464,7 @@ public void testExecute_withFixedTokenLength_andSourceDataListHybridType_failure
     }
 
     @SneakyThrows
-    public void testExecute_withFixedTokenLength_andSourceDataListWithNull_failure() {
+    public void testExecute_withFixedTokenLength_andSourceDataListWithNull_thenFail() {
         DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap());
         IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListWithNull());
         IllegalArgumentException illegalArgumentException = assertThrows(
@@ -471,7 +476,7 @@ public void testExecute_withFixedTokenLength_andSourceDataListWithNull_failure()
 
     @SuppressWarnings("unchecked")
     @SneakyThrows
-    public void testExecute_withFixedTokenLength_andFieldMapNestedMap_successful() {
+    public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenSucceed() {
         DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap());
         IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMap());
         IngestDocument document = processor.execute(ingestDocument);
@@ -490,7 +495,7 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_successful() {
     }
 
     @SneakyThrows
-    public void testExecute_withFixedTokenLength_andMaxDepthLimitExceedFieldMap_failure() {
+    public void testExecute_withFixedTokenLength_andMaxDepthLimitExceedFieldMap_thenFail() {
         DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap());
         IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createMaxDepthLimitExceedMap(0));
         IllegalArgumentException illegalArgumentException = assertThrows(
@@ -504,7 +509,7 @@ public void testExecute_withFixedTokenLength_andMaxDepthLimitExceedFieldMap_fail
     }
 
     @SneakyThrows
-    public void testExecute_withFixedTokenLength_andFieldMapNestedMap_failure() {
+    public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenFail() {
         DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap());
         IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataInvalidNestedMap());
         IllegalArgumentException illegalArgumentException = assertThrows(
@@ -519,7 +524,7 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_failure() {
 
     @SneakyThrows
     @SuppressWarnings("unchecked")
-    public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_successful() {
+    public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_thenSucceed() {
         DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap());
         IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataListNestedMap());
         IngestDocument document = processor.execute(ingestDocument);
@@ -542,7 +547,7 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_suc
     }
 
     @SneakyThrows
-    public void testExecute_withDelimiter_andSourceDataString_successful() {
+    public void testExecute_withDelimiter_andSourceDataString_thenSucceed() {
         DocumentChunkingProcessor processor = createDelimiterInstance();
         IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
         IngestDocument document = processor.execute(ingestDocument);
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java
index a1fa4185c..1245f2a71 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java
@@ -26,7 +26,6 @@ public void testChunkerWithDelimiterFieldNotString() {
     public void testChunkerWithDelimiterFieldNoString() {
         DelimiterChunker chunker = new DelimiterChunker();
-        String content = "a\nb\nc\nd";
         Map inputParameters = Map.of(DELIMITER_FIELD, "");
         Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters));
         Assert.assertEquals("delimiter parameters should not be empty.", exception.getMessage());
@@ -87,5 +86,4 @@ public void testChunkerWithStringDelimiter() {
         List chunkResult = chunker.chunk(content, inputParameters);
         assertEquals(List.of("\n\n", "a\n\n", "\n"), chunkResult);
     }
-
 }
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactoryTests.java
new file mode 100644
index 000000000..8fb8e1421
--- /dev/null
+++ b/src/test/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactoryTests.java
@@ -0,0 +1,114 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.processor.factory;
+
+import lombok.SneakyThrows;
+import org.apache.lucene.tests.analysis.MockTokenizer;
+import org.junit.Before;
+import org.opensearch.cluster.service.ClusterService;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.env.Environment;
+import org.opensearch.env.TestEnvironment;
+import org.opensearch.index.analysis.AnalysisRegistry;
+import org.opensearch.index.analysis.TokenizerFactory;
+import org.opensearch.indices.IndicesService;
+import org.opensearch.indices.analysis.AnalysisModule;
+import org.opensearch.ingest.Processor;
+import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor;
+import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
+import org.opensearch.plugins.AnalysisPlugin;
+import org.opensearch.test.OpenSearchTestCase;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import static java.util.Collections.singletonList;
+import static java.util.Collections.singletonMap;
+import static org.mockito.Mockito.mock;
+import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.TYPE;
+import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.FIELD_MAP_FIELD;
+import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD;
+
+public class DocumentChunkingProcessorFactoryTests extends OpenSearchTestCase {
+
+    private static final String PROCESSOR_TAG = "mockTag";
+    private static final String DESCRIPTION = "mockDescription";
+    private static final Map algorithmMap = Map.of(ChunkerFactory.FIXED_LENGTH_ALGORITHM, new HashMap<>());
+
+    private DocumentChunkingProcessorFactory documentChunkingProcessorFactory;
+
+    @SneakyThrows
+    private AnalysisRegistry getAnalysisRegistry() {
+        Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
+        Environment environment = TestEnvironment.newEnvironment(settings);
+        AnalysisPlugin plugin = new AnalysisPlugin() {
+
+            @Override
+            public Map> getTokenizers() {
+                return singletonMap(
+                    "keyword",
+                    (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory(
+                        name,
+                        () -> new MockTokenizer(MockTokenizer.KEYWORD, false)
+                    )
+                );
+            }
+        };
+        return new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
+    }
+
+    @Before
+    public void setup() {
+        Environment environment = mock(Environment.class);
+        ClusterService clusterService = mock(ClusterService.class);
+        IndicesService indicesService = mock(IndicesService.class);
+        this.documentChunkingProcessorFactory = new DocumentChunkingProcessorFactory(
+            environment,
+            clusterService,
+            indicesService,
+            getAnalysisRegistry()
+        );
+    }
+
+    @SneakyThrows
+    public void testDocumentChunkingProcessorFactory_whenAllParamsPassed_thenSuccessful() {
+        final Map processorFactories = new HashMap<>();
+        Map config = new HashMap<>();
+        config.put(ALGORITHM_FIELD, algorithmMap);
+        config.put(FIELD_MAP_FIELD, new HashMap<>());
+        DocumentChunkingProcessor documentChunkingProcessor = documentChunkingProcessorFactory.create(
+            processorFactories,
+            PROCESSOR_TAG,
+            DESCRIPTION,
+            config
+        );
+        assertNotNull(documentChunkingProcessor);
+        assertEquals(TYPE, documentChunkingProcessor.getType());
+    }
+
+    @SneakyThrows
+    public void testDocumentChunkingProcessorFactory_whenOnlyFieldMap_thenFail() {
+        final Map processorFactories = new HashMap<>();
+        Map config = new HashMap<>();
+        config.put(FIELD_MAP_FIELD, new HashMap<>());
+        Exception exception = assertThrows(
+            Exception.class,
+            () -> documentChunkingProcessorFactory.create(processorFactories, PROCESSOR_TAG, DESCRIPTION, config)
+        );
+        assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", exception.getMessage());
+    }
+
+    @SneakyThrows
+    public void testDocumentChunkingProcessorFactory_whenOnlyAlgorithm_thenFail() {
+        final Map processorFactories = new HashMap<>();
+        Map config = new HashMap<>();
+        config.put(ALGORITHM_FIELD, algorithmMap);
+        Exception exception = assertThrows(
+            Exception.class,
+            () -> documentChunkingProcessorFactory.create(processorFactories, PROCESSOR_TAG, DESCRIPTION, config)
+        );
+        assertEquals("[" + FIELD_MAP_FIELD + "] required property is missing", exception.getMessage());
+    }
+}