Skip to content

Commit

Permalink
minor changes
Browse files (browse the repository at this point in the history)
  • Loading branch information
Sayan Shaw committed Jan 16, 2025
1 parent c3dc4c1 commit d6f9dc7
Show file tree
Hide file tree
Showing 4 changed files with 2 additions and 6 deletions.
2 changes: 0 additions & 2 deletions operators/tokenizer/bpe_tokenizer_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -417,8 +417,6 @@ class BpeModel {

if (model_name == "Llama") {
return bpe::PreTokenizerWithRegEx::LLAMA_REGEX_PATTERN;
} else if (model_name == "Phi") {
return bpe::PreTokenizerWithRegEx::PHI4_REGEX_PATTERN;
}

// by default, use the GPT2 pretokenizer regex
Expand Down
3 changes: 1 addition & 2 deletions operators/tokenizer/bpe_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,6 @@ class PreTokenizerWithRegEx {
R"('s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)";
static constexpr const char LLAMA_REGEX_PATTERN[] =
R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+)";
static constexpr const char PHI4_REGEX_PATTERN[] =
R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|?[^\s\p{L}\p{N}]+[\r\n]*)";

PreTokenizerWithRegEx() = default;

Expand Down Expand Up @@ -499,6 +497,7 @@ class PreTokenizerWithRegEx {
{R"([^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_CLIP_Pattern_2},
{R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?)", &PreTokenizerWithRegEx::Match_PHI4_Pattern_1},
{R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?)", &PreTokenizerWithRegEx::Match_PHI4_Pattern_2},
{R"(?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
{R"(\p{N})", &PreTokenizerWithRegEx::Match_General_Pattern_1},
};

Expand Down
1 change: 0 additions & 1 deletion test/pp_api_test/test_tokenizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ TEST(OrtxTokenizerTest, RegexTest) {

int64_t max_length = out_tokens.size();
reg_splitter->Set(str.c_str());
auto regex_expr = reg_splitter->PHI4_REGEX_PATTERN;
auto status = reg_splitter->Compile(R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?)");
assert(status.IsOk());

Expand Down
2 changes: 1 addition & 1 deletion test/test_pp_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def test_llama3_2_image_processing(self):
a_image.save(f"{self.temp_dir}/a_{idx}_{i}.png")

# test sentence for tokenizer
tokenizer_test_sentence = "I like walking my cute dog\n and\x17 then 生活的真谛是 \t\t\t\t \n\n61"
tokenizer_test_sentence = "I like walking my cute dog\n and\x17 then 生活的真谛是 \t\t\t\t \n\n61. You'll enjoy the concert."

def test_OLMa_tokenizer(self):
test_sentence = [self.tokenizer_test_sentence + " |||IP_ADDRESS|||"]
Expand Down

0 comments on commit d6f9dc7

Please sign in to comment.