Skip to content

Commit

Permalink
[ML] Fix deberta tokenizer bug caused by bug in normalizer (elastic#1…
Browse files Browse the repository at this point in the history
…17189)

* Fix a DeBERTa tokenizer bug caused by a bug in the normalizer, which caused offsets to be negative

* Update docs/changelog/117189.yaml
  • Loading branch information
maxhniebergall authored Nov 21, 2024
1 parent 1a4b3d3 commit 5500a5e
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 1 deletion.
5 changes: 5 additions & 0 deletions docs/changelog/117189.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 117189
summary: Fix deberta tokenizer bug caused by bug in normalizer
area: Machine Learning
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ Reader normalize(CharSequence str) {
if (charDelta < 0) {
// normalised form is shorter
int lastDiff = getLastCumulativeDiff();
addOffCorrectMap(normalizedCharPos, lastDiff + charDelta);
addOffCorrectMap(normalizedCharPos, lastDiff - charDelta);
} else if (charDelta > 0) {
// inserted chars, add the offset in the output stream
int lastDiff = getLastCumulativeDiff();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,20 @@ public void testTokenize() throws IOException {
}
}

/**
 * Tokenizes a string that starts with two non-printing control characters
 * (U+009F and U+008F) followed by the letter {@code z}, and checks that the
 * first token sequence is exactly {@code "▁"} followed by {@code "z"}.
 *
 * @throws IOException declared for the tokenizer resource used by the try-with-resources block
 */
public void testTokenizeWithHiddenControlCharacters() throws IOException {
    // Two control characters precede the only visible character.
    String input = "\u009F\u008Fz";
    DebertaV2Tokenization config = new DebertaV2Tokenization(false, false, null, Tokenization.Truncate.NONE, -1);

    try (DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(TEST_CASE_VOCAB, TEST_CASE_SCORES, config).build()) {
        // Only the first tokenization result is inspected.
        TokenizationResult.Tokens tokenization = tokenizer.tokenize(input, Tokenization.Truncate.NONE, -1, 0, null).get(0);

        // The control characters must not show up as tokens of their own.
        assertThat(tokenStrings(tokenization.tokens().get(0)), contains("▁", "z"));
    }
}

public void testSurrogatePair() throws IOException {
try (
DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(
Expand Down

0 comments on commit 5500a5e

Please sign in to comment.