[ML] Fix deberta tokenizer bug caused by bug in normalizer (elastic#1…

…17189) * Fix deberta tokenizer bug caused by bug in normalizer which caused offsets to be negative * Update docs/changelog/117189.yaml
jimczi · Nov 21, 2024 · 5500a5e · 5500a5e
1 parent 1a4b3d3
commit 5500a5e
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 1 deletion.
diff --git a/docs/changelog/117189.yaml b/docs/changelog/117189.yaml
@@ -0,0 +1,5 @@
+pr: 117189
+summary: Fix deberta tokenizer bug caused by bug in normalizer
+area: Machine Learning
+type: bug
+issues: []
diff --git a/...ava/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java b/...ava/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java
@@ -194,7 +194,7 @@ Reader normalize(CharSequence str) {
                     if (charDelta < 0) {
                         // normalised form is shorter
                         int lastDiff = getLastCumulativeDiff();
-                        addOffCorrectMap(normalizedCharPos, lastDiff + charDelta);
+                        addOffCorrectMap(normalizedCharPos, lastDiff - charDelta);
                     } else if (charDelta > 0) {
                         // inserted chars, add the offset in the output stream
                         int lastDiff = getLastCumulativeDiff();

diff --git a/...est/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java b/...est/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java
@@ -94,6 +94,20 @@ public void testTokenize() throws IOException {
         }
     }
 
+    public void testTokenizeWithHiddenControlCharacters() throws IOException {
+        try (
+            DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(
+                TEST_CASE_VOCAB,
+                TEST_CASE_SCORES,
+                new DebertaV2Tokenization(false, false, null, Tokenization.Truncate.NONE, -1)
+            ).build()
+        ) {
+            TokenizationResult.Tokens tokenization = tokenizer.tokenize("\u009F\u008Fz", Tokenization.Truncate.NONE, -1, 0, null).get(0);
+            assertThat(tokenStrings(tokenization.tokens().get(0)), contains("▁", "z"));
+
+        }
+    }
+
     public void testSurrogatePair() throws IOException {
         try (
             DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(