Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…into baijumeswani/filesystem
  • Loading branch information
baijumeswani committed May 3, 2024
2 parents a53fda9 + e82ab3d commit d144495
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 31 deletions.
33 changes: 25 additions & 8 deletions src/tokenizer/token_bpe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -237,15 +237,17 @@ std::vector<tfmTokenId_t> BPETokenizer::Encode(std::string_view sv_input, int64_
text = text.strip()
*/
std::u32string str = RemoveConsecutiveSpaces(input);
if (IsUnicodeSpace(str.front())) {
str.erase(str.begin());
}
if (IsUnicodeSpace(str.back())) {
str.pop_back();
if (!str.empty()) {
if (IsUnicodeSpace(str.front())) {
str.erase(str.begin());
}
if (IsUnicodeSpace(str.back())) {
str.pop_back();
}
// remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
str.erase(std::remove(str.begin(), str.end(), U'\n'), str.end());
str.erase(std::remove(str.begin(), str.end(), U'\r'), str.end());
}
// remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
str.erase(std::remove(str.begin(), str.end(), U'\n'), str.end());
str.erase(std::remove(str.begin(), str.end(), U'\r'), str.end());
input = str;
}

Expand Down Expand Up @@ -592,6 +594,21 @@ TfmStatus BPETokenizer::Id2Token(tfmTokenId_t id, std::string& token, DecoderSta
token.push_back(' ');
}
} // end case of whitespace_token_

bpe_state->incomplete_utf8_ += token;
token.clear();
std::string& s_utf8 = bpe_state->incomplete_utf8_;
size_t utf8_len = 1;
size_t utf8_all_len = 0;
for (size_t i = 0; i < s_utf8.size(); i += utf8_len) {
utf8_len = UTF8Len(s_utf8[i]);
if (utf8_len <= s_utf8.size() - i) {
utf8_all_len += utf8_len;
auto _t = s_utf8.substr(i, utf8_len);
token += ValidateUTF8(_t) ? _t : "";
}
}
s_utf8 = s_utf8.substr(utf8_all_len);
}

return status;
Expand Down
1 change: 1 addition & 0 deletions src/tokenizer/token_bpe.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class BPETokenizer : public TokenizerImpl {
BPEDeocerState() = default;
~BPEDeocerState() override = default;
bool f_special_last;
std::string incomplete_utf8_;
};

public:
Expand Down
9 changes: 4 additions & 5 deletions src/tokenizer/tokenizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@ TfmStatus CreateBPETokenizer(const std::string& tokenizer_path,
if (type.empty()) {
if (BPETokenizer::IsSupportedModel(GetModelName(token_cfg->tokenizer_class_))) {
type = "BPE";
} else if (fs::exists(tokenizer_path + "/tokenizer.model")) {
} /* else if (fs::exists(tokenizer_path + "/tokenizer.model")) {
// if 'tokenizer.model' exists in the tokenizer_path, then it is a SentencePiece model
type = "SPM";
} else {
} */ else {
status = TfmStatus(kTfmErrorInvalidArgument, "Cannot determine the tokenizer type from tokenizer_path argument");
}
}
Expand All @@ -42,9 +42,8 @@ TfmStatus CreateBPETokenizer(const std::string& tokenizer_path,
token_ptr = std::make_unique<BPETokenizer>();
} /* else if (type == "SPM") {
token_ptr = std::make_unique<SpmTokenizer>();
} */
else {
status = TfmStatus(kTfmErrorInvalidArgument, "Unknown tokenizer_type, (BPE, SPM, RKWV) are supported.");
} */ else {
status = TfmStatus(kTfmErrorInvalidArgument, "Unknown tokenizer_type, (BPE, RKWV) are supported.");
}

if (status.ok()) {
Expand Down
69 changes: 51 additions & 18 deletions src/tokenizer/utils/unescape.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,27 +41,60 @@ std::string EncodeUTF8Char(char32_t utf8_char) {
return {utf8_buf};
}

bool ValidateUTF8(const std::string& data) {
int cnt = 0;
for (size_t i = 0; i < data.size(); i++) {
int x = data[i];
if (!cnt) {
if ((x >> 5) == 0b110) {
cnt = 1;
} else if ((x >> 4) == 0b1110) {
cnt = 2;
} else if ((x >> 3) == 0b11110) {
cnt = 3;
} else if ((x >> 7) != 0) {
bool ValidateUTF8(const std::string& data) {
const unsigned char* s = reinterpret_cast<const unsigned char*>(data.c_str());
const unsigned char* s_end = s + data.size();
if (*s_end != '\0')
return false;

while (*s) {
if (*s < 0x80)
/* 0xxxxxxx */
s++;
else if ((s[0] & 0xe0) == 0xc0) {
/* 110XXXXx 10xxxxxx */
if (s + 1 >= s_end) {
return false;
}
if ((s[1] & 0xc0) != 0x80 ||
(s[0] & 0xfe) == 0xc0) /* overlong? */
return false;
else
s += 2;
} else if ((s[0] & 0xf0) == 0xe0) {
/* 1110XXXX 10Xxxxxx 10xxxxxx */
if (s + 2 >= s_end) {
return false;
}
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */
(s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */
(s[0] == 0xef && s[1] == 0xbf &&
(s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */
return false;
else
s += 3;
} else if ((s[0] & 0xf8) == 0xf0) {
/* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
if (s + 3 >= s_end) {
return false;
}
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80 ||
(s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */
(s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
return false;
else
s += 4;
} else
return false;
}
} else {
if ((x >> 6) != 0b10) return false;
cnt--;
}

return true;
}
return cnt == 0;
}


bool IsDigit(char c) { return c >= '0' && c <= '9'; }
bool IsHexDigit(char c) { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); }
Expand Down

0 comments on commit d144495

Please sign in to comment.