Skip to content

Commit

Permalink
Add SP Space handling for decoder
Browse files Browse the repository at this point in the history
  • Loading branch information
apaniukov committed Nov 20, 2023
1 parent 6c3bae3 commit d34d401
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,9 @@ def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Mode
[sp_model_node, token_ids],
).outputs()

if streaming_decoder:
decoder = RegexDecodingStep.replace_sp_spaces().get_ov_subgraph(decoder)

string_output = factory.create("StringTensorPack", decoder).outputs()
string_output[0].tensor.add_names({STRING_OUTPUT_NAME})
tokenizer_decoder = Model(string_output, [token_ids], TOKENIZER_DECODER_NAME)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,13 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
)
return factory.create("RegexNormalization", input_nodes).outputs()

@classmethod
def replace_sp_spaces(cls) -> "RegexDecodingStep":
    """Build a decoding step that maps the SentencePiece word-boundary
    marker (U+2581, "▁") back to an ordinary ASCII space.

    Returns:
        A ``RegexDecodingStep`` configured with the marker as the search
        pattern and ``" "`` as the replacement.
    """
    # U+2581 LOWER ONE EIGHTH BLOCK — SentencePiece's whitespace stand-in.
    sp_space_marker = "▁"
    return cls(regex_search_pattern=sp_space_marker, replace_term=" ")


@dataclass
class TokenizerPipeline:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
"tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.88,
"tokenizers_test.py::test_bpe_detokenizer": 0.9529411764705882,
"tokenizers_test.py::test_tiktoken_tokenizers": 0.9,
"tokenizers_test.py::test_": 0.8124118476727785
"tokenizers_test.py::test_": 0.825187969924812
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = "path/to/libuser_ov_extensions.so"

import numpy as np
import openvino
import pytest
from openvino import Core
from transformers import AutoTokenizer
Expand Down Expand Up @@ -321,3 +322,20 @@ def test_tiktoken_detokenizer(tiktoken_detokenizers, test_string):
ov_output = unpack_strings(ov_detokenizer(token_ids.astype("int32"))["string_output"])

assert ov_output == hf_output


def test_streaming_detokenizer():
    """Detokenizing one token id at a time and concatenating the pieces
    must reproduce HF's full-sequence decode of the same ids."""
    hf_tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_3b_v2")
    _, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True, streaming_decoder=True)
    ov_detokenizer = core.compile_model(ov_detokenizer)

    test_string = "this is a test string"
    token_ids = hf_tokenizer(test_string).input_ids
    expected = hf_tokenizer.decode(token_ids)

    # Feed ids one by one, as a text-generation loop would.
    pieces = [
        unpack_strings(ov_detokenizer(np.atleast_2d(token_id))["string_output"])[0]
        for token_id in token_ids
    ]

    assert "".join(pieces) == expected

0 comments on commit d34d401

Please sign in to comment.