Skip to content

Commit

Permalink
move analysis_registry to non-runtime parameters
Browse files Browse the repository at this point in the history
Signed-off-by: xinyual <[email protected]>
  • Loading branch information
xinyual committed Mar 11, 2024
1 parent 35588a2 commit 75badf7
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
import org.opensearch.index.mapper.IndexFieldMapper;
import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;

import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM;

/**
* This processor chunks user input data so that the chunked data can be used by a downstream embedding processor;
* algorithm can be used to indicate the chunking algorithm and its parameters,
Expand Down Expand Up @@ -111,7 +113,10 @@ private void validateAndParseAlgorithmMap(Map<String, Object> algorithmMap) {
);
}
Map<String, Object> chunkerParameters = (Map<String, Object>) algorithmValue;
this.chunker = ChunkerFactory.create(algorithmKey, analysisRegistry, chunkerParameters);
if (Objects.equals(algorithmKey, FIXED_TOKEN_LENGTH_ALGORITHM)) {
chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry);
}
this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters);
if (chunkerParameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) {
String maxChunkLimitString = chunkerParameters.get(MAX_CHUNK_LIMIT_FIELD).toString();
if (!(NumberUtils.isParsable(maxChunkLimitString))) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

import org.opensearch.index.analysis.AnalysisRegistry;

import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD;

/**
* A factory to create different chunking algorithm classes and return all supported chunking algorithms.
*/
Expand All @@ -17,10 +19,10 @@ public class ChunkerFactory {
public static final String FIXED_TOKEN_LENGTH_ALGORITHM = "fixed_token_length";
public static final String DELIMITER_ALGORITHM = "delimiter";

public static Chunker create(String type, AnalysisRegistry analysisRegistry, Map<String, Object> parameters) {
public static Chunker create(String type, Map<String, Object> parameters) {
switch (type) {
case FIXED_TOKEN_LENGTH_ALGORITHM:
return new FixedTokenLengthChunker(analysisRegistry, parameters);
return new FixedTokenLengthChunker((AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD), parameters);
case DELIMITER_ALGORITHM:
return new DelimiterChunker(parameters);
default:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
@Log4j2
public class FixedTokenLengthChunker implements Chunker {

public static final String ANALYSIS_REGISTRY_FIELD = "analysis_registry";
public static final String TOKEN_LIMIT_FIELD = "token_limit";
public static final String OVERLAP_RATE_FIELD = "overlap_rate";
public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import java.util.Map;
import java.util.Set;

import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD;

public class ChunkerFactoryTests extends OpenSearchTestCase {

@Mock
Expand All @@ -22,13 +24,13 @@ public void testGetAllChunkers() {
}

public void testCreate_FixedTokenLength() {
Chunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, analysisRegistry, Map.of());
Chunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, Map.of(ANALYSIS_REGISTRY_FIELD, analysisRegistry));
assertNotNull(chunker);
assertTrue(chunker instanceof FixedTokenLengthChunker);
}

public void testCreate_Delimiter() {
Chunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, analysisRegistry, Map.of());
Chunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, Map.of(ANALYSIS_REGISTRY_FIELD, analysisRegistry));
assertNotNull(chunker);
assertTrue(chunker instanceof DelimiterChunker);
}
Expand All @@ -37,7 +39,7 @@ public void testCreate_Invalid() {
String invalidChunkerType = "Invalid Chunker Type";
IllegalArgumentException illegalArgumentException = assertThrows(
IllegalArgumentException.class,
() -> ChunkerFactory.create(invalidChunkerType, analysisRegistry, Map.of())
() -> ChunkerFactory.create(invalidChunkerType, Map.of(ANALYSIS_REGISTRY_FIELD, analysisRegistry))
);
assert (illegalArgumentException.getMessage().contains("chunker type [" + invalidChunkerType + "] is not supported."));
}
Expand Down

0 comments on commit 75badf7

Please sign in to comment.