Skip to content

Commit

Permalink
add test cases in unit test for fixed token length chunker
Browse files Browse the repository at this point in the history
Signed-off-by: yuye-aws <[email protected]>
  • Loading branch information
yuye-aws committed Feb 26, 2024
1 parent bc18788 commit 06933d2
Showing 1 changed file with 15 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,19 @@ public void testChunk_withTokenLimit_20() {
expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch");
assertEquals(expectedPassages, passages);
}

/**
 * Chunks the sample two-sentence document with a token limit of 10 and an overlap rate
 * of 0.5, then compares the result with four expected passages. In the expected output
 * each passage after the first begins with the last five tokens of the previous one.
 */
public void testChunk_withOverlapRate_half() {
    final String document =
        "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";

    // Expected passages in order; note that the input's punctuation is absent from them.
    final List<String> expected = new ArrayList<>();
    expected.add("This is an example document to be chunked The document");
    expected.add("to be chunked The document contains a single paragraph two");
    expected.add("contains a single paragraph two sentences and 24 tokens by");
    expected.add("sentences and 24 tokens by standard tokenizer in OpenSearch");

    // Chunker configuration: at most 10 tokens per passage, half of which overlap.
    final Map<String, Object> chunkerParameters = new HashMap<>();
    chunkerParameters.put(TOKEN_LIMIT, 10);
    chunkerParameters.put(OVERLAP_RATE, 0.5);

    final List<String> actual = FixedTokenLengthChunker.chunk(document, chunkerParameters);
    assertEquals(expected, actual);
}
}

0 comments on commit 06933d2

Please sign in to comment.