Skip to content

Commit

Permalink
extract max chunk limit check to util class
Browse files Browse the repository at this point in the history
Signed-off-by: yuye-aws <[email protected]>
  • Loading branch information
yuye-aws committed Mar 17, 2024
1 parent 9702168 commit 3d8c030
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 35 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
package org.opensearch.neuralsearch.processor.chunker;

import java.util.Locale;

import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD;

/**
 * A utility class shared by chunking algorithms.
 */
public final class ChunkerUtil {

    private ChunkerUtil() {} // noninstantiable utility class

    /**
     * Checks whether the chunking result has reached the runtime max chunk limit, meaning that
     * producing one more chunk would exceed it. If so, throws {@link IllegalStateException}.
     *
     * @param chunkResultSize the number of chunks produced so far
     * @param runtimeMaxChunkLimit runtime max_chunk_limit, compared against chunkResultSize
     * @param nonRuntimeMaxChunkLimit processor-level max_chunk_limit, used only to keep the exception message consistent
     * @throws IllegalStateException if chunkResultSize has reached runtimeMaxChunkLimit
     */
    public static void checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int nonRuntimeMaxChunkLimit) {
        // Deliberately '==' rather than '>=': callers invoke this before adding each chunk, so the
        // size grows one at a time, and a sentinel limit (e.g. a negative "disabled" value) must
        // never trigger the exception. NOTE(review): confirm callers rely on a sentinel — TODO verify.
        if (chunkResultSize == runtimeMaxChunkLimit) {
            throw new IllegalStateException(
                String.format(
                    Locale.ROOT,
                    "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. This limit can be set by changing the [%s] parameter.",
                    TYPE,
                    nonRuntimeMaxChunkLimit,
                    MAX_CHUNK_LIMIT_FIELD
                )
            );
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@

import java.util.Map;
import java.util.List;
import java.util.Locale;
import java.util.ArrayList;

import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;

Expand Down Expand Up @@ -37,7 +35,7 @@ public DelimiterChunker(final Map<String, Object> parameters) {
*
* @param parameters a map with non-runtime parameters as the following:
* 1. delimiter A string as the paragraph split indicator
* 2. max_chunk_limit processor level max chunk level
* 2. max_chunk_limit processor level max chunk limit
*/
@Override
public void parseParameters(Map<String, Object> parameters) {
Expand All @@ -50,7 +48,7 @@ public void parseParameters(Map<String, Object> parameters) {
*
* @param content input string
* @param runtimeParameters a map for runtime parameters, containing the following runtime parameters:
* 1. max_chunk_level content level max chunk limit
* 1. max_chunk_limit field level max chunk limit
*/
@Override
public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
Expand All @@ -61,33 +59,18 @@ public List<String> chunk(final String content, final Map<String, Object> runtim
int nextDelimiterPosition = content.indexOf(delimiter);

while (nextDelimiterPosition != -1) {
checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit);
ChunkerUtil.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, maxChunkLimit);
end = nextDelimiterPosition + delimiter.length();
chunkResult.add(content.substring(start, end));
start = end;
nextDelimiterPosition = content.indexOf(delimiter, start);
}

if (start < content.length()) {
checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit);
ChunkerUtil.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, maxChunkLimit);
chunkResult.add(content.substring(start));
}

return chunkResult;
}

private void checkRunTimeMaxChunkLimit(int chunkResultLength, int runtimeMaxChunkLimit) {
if (chunkResultLength == runtimeMaxChunkLimit) {
// need processorMaxChunkLimit to keep exception message consistent
throw new IllegalStateException(
String.format(
Locale.ROOT,
"The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. This limit can be set by changing the [%s] parameter.",
TYPE,
maxChunkLimit,
MAX_CHUNK_LIMIT_FIELD
)
);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import org.opensearch.index.analysis.AnalysisRegistry;
import org.opensearch.action.admin.indices.analyze.AnalyzeAction;
import org.opensearch.action.admin.indices.analyze.AnalyzeAction.AnalyzeToken;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleParameter;
Expand Down Expand Up @@ -73,10 +72,12 @@ public FixedTokenLengthChunker(final Map<String, Object> parameters) {
* 1. tokenizer: the <a href="https://opensearch.org/docs/latest/analyzers/tokenizers/index/">word tokenizer</a> in opensearch
* 2. token_limit: the token limit for each chunked passage
* 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many token comes from the previous passage
* 4. max_chunk_limit processor level max chunk limit
* Here are requirements for non-runtime parameters:
* 1. token_limit must be a positive integer
* 2. overlap_rate must be within range [0, 0.5]
* 3. tokenizer must be a word tokenizer
*
*/
@Override
public void parseParameters(Map<String, Object> parameters) {
Expand Down Expand Up @@ -115,7 +116,7 @@ public void parseParameters(Map<String, Object> parameters) {
* @param content input string
* @param runtimeParameters a map for runtime parameters, containing the following runtime parameters:
* 1. max_token_count the max token limit for the tokenizer
* 2. runtime_max_chunk_limit runtime max chunk limit for the algorithm
* 2. max_chunk_limit field level max chunk limit
*/
@Override
public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
Expand All @@ -130,18 +131,7 @@ public List<String> chunk(final String content, final Map<String, Object> runtim
int overlapTokenNumber = (int) Math.floor(tokenLimit * overlapRate);

while (startTokenIndex < tokens.size()) {
if (chunkResult.size() == runtimeMaxChunkLimit) {
// need processor level max chunk level to keep exception message consistent
throw new IllegalStateException(
String.format(
Locale.ROOT,
"The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. This limit can be set by changing the [%s] parameter.",
TYPE,
maxChunkLimit,
MAX_CHUNK_LIMIT_FIELD
)
);
}
ChunkerUtil.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, maxChunkLimit);
if (startTokenIndex == 0) {
// include all characters till the start if no previous passage
startContentPosition = 0;
Expand Down

0 comments on commit 3d8c030

Please sign in to comment.