Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add max chunk limit #5

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ public DelimiterChunker() {}

public static String DELIMITER_FIELD = "delimiter";

public static String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit";

@Override
public void validateParameters(Map<String, Object> parameters) {
if (parameters.containsKey(DELIMITER_FIELD)) {
Expand All @@ -26,11 +28,26 @@ public void validateParameters(Map<String, Object> parameters) {
} else {
throw new IllegalArgumentException("You must contain field:" + DELIMITER_FIELD + " in your parameter.");
}
if (parameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) {
Object maxChunkLimit = parameters.get(MAX_CHUNK_LIMIT_FIELD);
if (!(maxChunkLimit instanceof String)) {
throw new IllegalArgumentException(
"Parameter max_chunk_limit:" + maxChunkLimit.toString() + " cannot be converted to integer."
);
} else {
try {
int maxChunkingNumber = Integer.valueOf((String) maxChunkLimit);
} catch (Exception exception) {
throw new IllegalArgumentException("Parameter max_chunk_limit:" + maxChunkLimit + " cannot be converted to integer.");
}
}
}
}

@Override
public List<String> chunk(String content, Map<String, Object> parameters) {
String delimiter = (String) parameters.get(DELIMITER_FIELD);
int maxChunkingNumber = Integer.valueOf((String) parameters.getOrDefault(MAX_CHUNK_LIMIT_FIELD, "0"));
List<String> chunkResult = new ArrayList<>();
int start = 0;
int end = content.indexOf(delimiter);
Expand All @@ -39,10 +56,16 @@ public List<String> chunk(String content, Map<String, Object> parameters) {
chunkResult.add(content.substring(start, end + delimiter.length()));
start = end + delimiter.length();
end = content.indexOf(delimiter, start);
if (chunkResult.size() > maxChunkingNumber && maxChunkingNumber > 0) {
throw new IllegalArgumentException("Exceed max chunk number: " + maxChunkingNumber);
}
}

if (start < content.length()) {
chunkResult.add(content.substring(start));
if (chunkResult.size() > maxChunkingNumber && maxChunkingNumber > 0) {
throw new IllegalArgumentException("Exceed max chunk number: " + maxChunkingNumber);
}
}
return chunkResult;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import static junit.framework.TestCase.assertEquals;
import static org.junit.Assert.assertThrows;
import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD;
import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.MAX_CHUNK_LIMIT_FIELD;

public class DelimiterChunkerTests extends OpenSearchTestCase {

Expand All @@ -24,6 +25,22 @@ public void testChunkerWithNoDelimiterField() {
Assert.assertEquals("You must contain field:" + DELIMITER_FIELD + " in your parameter.", exception.getMessage());
}

/**
 * Verifies that {@code validateParameters} throws an {@link IllegalArgumentException} when
 * {@code max_chunk_limit} is supplied as a list instead of a string, and that the message
 * embeds the offending value.
 */
public void testChunkerWithWrongLimitFieldList() {
    DelimiterChunker chunker = new DelimiterChunker();
    // Only parameter validation is exercised, so no content string is needed.
    List<String> invalidLimit = List.of("-1");
    Map<String, Object> inputParameters = Map.of(MAX_CHUNK_LIMIT_FIELD, invalidLimit, DELIMITER_FIELD, "\n");
    Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters));
    Assert.assertEquals("Parameter max_chunk_limit:" + invalidLimit + " cannot be converted to integer.", exception.getMessage());
}

/**
 * Verifies that {@code validateParameters} throws an {@link IllegalArgumentException} when
 * {@code max_chunk_limit} is a string that is not a plain integer (here, digits followed by
 * a newline), and that the message embeds the offending value.
 */
public void testChunkerWithWrongLimitField() {
    DelimiterChunker chunker = new DelimiterChunker();
    // Only parameter validation is exercised, so no content string is needed.
    Map<String, Object> inputParameters = Map.of(MAX_CHUNK_LIMIT_FIELD, "1000\n", DELIMITER_FIELD, "\n");
    Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters));
    Assert.assertEquals("Parameter max_chunk_limit:1000\n cannot be converted to integer.", exception.getMessage());
}

public void testChunkerWithDelimiterFieldNotString() {
DelimiterChunker chunker = new DelimiterChunker();
String content = "a\nb\nc\nd";
Expand All @@ -40,6 +57,14 @@ public void testChunkerWithDelimiterFieldNoString() {
Assert.assertEquals("delimiter parameters should not be empty.", exception.getMessage());
}

/**
 * Checks that {@code chunk} throws an {@link IllegalArgumentException} with the expected message
 * when the input is chunked with {@code max_chunk_limit} set to "1".
 */
public void testChunkerWithLimitNumber() {
    String text = "a\nb\nc\nd";
    Map<String, Object> limitedParameters = Map.of(DELIMITER_FIELD, "\n", MAX_CHUNK_LIMIT_FIELD, "1");
    DelimiterChunker delimiterChunker = new DelimiterChunker();
    IllegalArgumentException thrown = assertThrows(
        IllegalArgumentException.class,
        () -> delimiterChunker.chunk(text, limitedParameters)
    );
    Assert.assertEquals("Exceed max chunk number: 1", thrown.getMessage());
}

public void testChunker() {
DelimiterChunker chunker = new DelimiterChunker();
String content = "a\nb\nc\nd";
Expand Down
Loading