diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java new file mode 100644 index 000000000..1fcc34768 --- /dev/null +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java @@ -0,0 +1,40 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor.chunker; + +import java.util.Locale; + +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; +import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD; + +/** + * A util class used by chunking algorithms. + */ +public class ChunkerUtil { + + private ChunkerUtil() {} // no instance of this util class + + /** + * Checks whether the chunking results would exceed the max chunk limit. + * If exceeds, then Throw IllegalStateException + * + * @param chunkResultSize the size of chunking result + * @param runtimeMaxChunkLimit runtime max_chunk_limit, used to check with chunkResultSize + * @param nonRuntimeMaxChunkLimit non-runtime max_chunk_limit, used to keep exception message consistent + */ + public static void checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int nonRuntimeMaxChunkLimit) { + if (chunkResultSize == runtimeMaxChunkLimit) { + throw new IllegalStateException( + String.format( + Locale.ROOT, + "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. 
This limit can be set by changing the [%s] parameter.", + TYPE, + nonRuntimeMaxChunkLimit, + MAX_CHUNK_LIMIT_FIELD + ) + ); + } + } +} diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 3e9d415de..c688af436 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -6,10 +6,8 @@ import java.util.Map; import java.util.List; -import java.util.Locale; import java.util.ArrayList; -import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter; @@ -37,7 +35,7 @@ public DelimiterChunker(final Map parameters) { * * @param parameters a map with non-runtime parameters as the following: * 1. delimiter A string as the paragraph split indicator - * 2. max_chunk_limit processor level max chunk level + * 2. max_chunk_limit processor level max chunk limit */ @Override public void parseParameters(Map parameters) { @@ -50,7 +48,7 @@ public void parseParameters(Map parameters) { * * @param content input string * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters: - * 1. max_chunk_level content level max chunk limit + * 1. 
max_chunk_limit field level max chunk limit */ @Override public List chunk(final String content, final Map runtimeParameters) { @@ -61,7 +59,7 @@ public List chunk(final String content, final Map runtim int nextDelimiterPosition = content.indexOf(delimiter); while (nextDelimiterPosition != -1) { - checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit); + ChunkerUtil.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, maxChunkLimit); end = nextDelimiterPosition + delimiter.length(); chunkResult.add(content.substring(start, end)); start = end; @@ -69,25 +67,10 @@ public List chunk(final String content, final Map runtim } if (start < content.length()) { - checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit); + ChunkerUtil.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, maxChunkLimit); chunkResult.add(content.substring(start)); } return chunkResult; } - - private void checkRunTimeMaxChunkLimit(int chunkResultLength, int runtimeMaxChunkLimit) { - if (chunkResultLength == runtimeMaxChunkLimit) { - // need processorMaxChunkLimit to keep exception message consistent - throw new IllegalStateException( - String.format( - Locale.ROOT, - "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. 
This limit can be set by changing the [%s] parameter.", +                    TYPE, +                    maxChunkLimit, +                    MAX_CHUNK_LIMIT_FIELD +                ) +            ); +        } +    } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 2225436d2..640fc2ab5 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -13,7 +13,6 @@ import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.action.admin.indices.analyze.AnalyzeAction; import org.opensearch.action.admin.indices.analyze.AnalyzeAction.AnalyzeToken; -import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleParameter; @@ -73,10 +72,12 @@ public FixedTokenLengthChunker(final Map parameters) { * 1. tokenizer: the word tokenizer in opensearch * 2. token_limit: the token limit for each chunked passage * 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many token comes from the previous passage + * 4. max_chunk_limit processor level max chunk limit * Here are requirements for non-runtime parameters: * 1. token_limit must be a positive integer * 2. overlap_rate must be within range [0, 0.5] * 3. tokenizer must be a word tokenizer + * */ @Override public void parseParameters(Map parameters) { @@ -115,7 +116,7 @@ public void parseParameters(Map parameters) { * @param content input string * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters: * 1.
max_token_count the max token limit for the tokenizer - * 2. runtime_max_chunk_limit runtime max chunk limit for the algorithm + * 2. max_chunk_limit field level max chunk limit */ @Override public List chunk(final String content, final Map runtimeParameters) { @@ -130,18 +131,7 @@ public List chunk(final String content, final Map runtim int overlapTokenNumber = (int) Math.floor(tokenLimit * overlapRate); while (startTokenIndex < tokens.size()) { - if (chunkResult.size() == runtimeMaxChunkLimit) { - // need processor level max chunk level to keep exception message consistent - throw new IllegalStateException( - String.format( - Locale.ROOT, - "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. This limit can be set by changing the [%s] parameter.", - TYPE, - maxChunkLimit, - MAX_CHUNK_LIMIT_FIELD - ) - ); - } + ChunkerUtil.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, maxChunkLimit); if (startTokenIndex == 0) { // include all characters till the start if no previous passage startContentPosition = 0;