Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: fix and improve utf8 utilities #1046

Merged
merged 4 commits into from
Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 22 additions & 8 deletions include/dpp/utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -738,22 +738,36 @@ uint32_t DPP_EXPORT hsl(int h, int s, int l);
std::string DPP_EXPORT debug_dump(uint8_t* data, size_t length);

/**
* @brief Returns the length of a UTF-8 string in codepoints
*
* @brief Returns the length of a UTF-8 string in codepoints.
* @note Result is unspecified for strings that are not valid UTF-8.
*
* @param str string to count length of
* @return size_t length of string (0 for invalid utf8)
* @return size_t Length of string
*/
size_t DPP_EXPORT utf8len(const std::string &str);
size_t DPP_EXPORT utf8len(std::string_view str);

/**
* @brief Return substring of a UTF-8 encoded string in codepoints
*
* @brief Return subview of a UTF-8 encoded string in codepoints.
* @note You must ensure that the resulting view is not used after the lifetime of the viewed string has ended.
* @note Result is unspecified for strings that are not valid UTF-8.
*
* @param str string to return subview from
* @param start start codepoint offset
* @param length length in codepoints
* @return std::string_view The requested subview
*/
std::string_view DPP_EXPORT utf8subview(std::string_view str, size_t start, size_t length);

/**
* @brief Return substring of a UTF-8 encoded string in codepoints.
* @note Result is unspecified for strings that are not valid UTF-8.
*
* @param str string to return substring from
* @param start start codepoint offset
* @param length length in codepoints
* @return std::string Substring in UTF-8 or empty string if invalid UTF-8 passed in
* @return std::string The requested substring
*/
std::string DPP_EXPORT utf8substr(const std::string& str, std::string::size_type start, std::string::size_type length);
std::string DPP_EXPORT utf8substr(std::string_view str, size_t start, size_t length);

/**
* @brief Read a whole file into a std::string.
Expand Down
2 changes: 1 addition & 1 deletion src/dpp/commandhandler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ bool commandhandler::string_has_prefix(std::string &str)
{
for (auto& p : prefixes) {
size_t prefix_length = utility::utf8len(p);
if (utility::utf8substr(str, 0, prefix_length) == p) {
if (utility::utf8subview(str, 0, prefix_length) == p) {
str.erase(str.begin(), str.begin() + prefix_length);
return true;
}
Expand Down
108 changes: 46 additions & 62 deletions src/dpp/utility.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -482,83 +482,67 @@ void exec(const std::string& cmd, std::vector<std::string> parameters, cmd_resul
t.detach();
}

size_t utf8len(const std::string &str)
{
size_t i = 0, iBefore = 0, count = 0;
const char* s = str.c_str();
if (*s == 0) {
return 0;
}
size_t utf8len(std::string_view str) {
const size_t raw_len = str.length();
size_t pos = 0;
size_t code_points = 0;

while (s[i] > 0) {
ascii:
i++;
}
while (pos != raw_len) {
const unsigned char cur = str[pos];

count += i - iBefore;
size_t code_point_len = 1;
code_point_len += static_cast<size_t>(cur >= 0b11000000);
code_point_len += static_cast<size_t>(cur >= 0b11100000);
code_point_len += static_cast<size_t>(cur >= 0b11110000);

while (s[i]) {
if (s[i] > 0) {
iBefore = i;
goto ascii;
} else {
switch (0xF0 & s[i]) {
case 0xE0:
i += 3;
break;
case 0xF0:
i += 4;
break;
default:
i += 2;
break;
}
if (raw_len - pos < code_point_len) {
return 0; // invalid utf8, avoid going past the end
}
pos += code_point_len;

count++;
code_points += 1;
}

return count;
return code_points;
}

std::string utf8substr(const std::string& str, std::string::size_type start, std::string::size_type leng)
{
if (leng == 0) {
return "";
}
if (start == 0 && leng >= utf8len(str)) {
return str;
}
std::string::size_type i, ix, q, min = std::string::npos, max = std::string::npos;
for (q = 0, i = 0, ix = str.length(); i < ix; i++, q++) {
if (q == start) {
min = i;
std::string_view utf8subview(std::string_view str, size_t start, size_t length) {
const size_t raw_len = str.length();
size_t pos = 0;
size_t code_points = 0;

size_t subview_start = raw_len;
size_t subview_len = std::string_view::npos;

while (pos != raw_len) {
if (code_points == start) {
subview_start = pos;
}
if (q <= start + leng || leng == std::string::npos) {
max = i;
if (code_points == start + length) {
subview_len = pos - subview_start;
break; // no point in traversing the remainder of the string
}

unsigned char c = (unsigned char)str[i];
if (c < 0x80) {
i += 0;
} else if ((c & 0xE0) == 0xC0) {
i += 1;
} else if ((c & 0xF0) == 0xE0) {
i += 2;
} else if ((c & 0xF8) == 0xF0) {
i += 3;
} else {
return ""; //invalid utf8
const unsigned char cur = str[pos];

size_t code_point_len = 1;
code_point_len += static_cast<size_t>(cur >= 0b11000000);
code_point_len += static_cast<size_t>(cur >= 0b11100000);
code_point_len += static_cast<size_t>(cur >= 0b11110000);

if (raw_len - pos < code_point_len) {
return ""; // invalid utf8, avoid going past the end
}
pos += code_point_len;

code_points += 1;
}
if (q <= start + leng || leng == std::string::npos) {
max = i;
}
if (min == std::string::npos || max == std::string::npos) {
return "";
}

return str.substr(min, max);
return str.substr(subview_start, subview_len);
}

/**
 * Owning variant of utf8subview(): selects the same codepoint range and
 * copies it into a freshly constructed std::string, so the result does not
 * depend on the lifetime of the viewed data.
 *
 * @param str string to return substring from
 * @param start start codepoint offset
 * @param length length in codepoints
 * @return std::string copy of whatever utf8subview() selects for these arguments
 */
std::string utf8substr(std::string_view str, size_t start, size_t length) {
	// Locate the range without allocating, then make a single owning copy.
	const std::string_view window = utf8subview(str, start, length);
	std::string owned(window);
	return owned;
}

std::string read_file(const std::string& filename)
Expand Down