Skip to content

Commit

Permalink
refactor: fix and improve utf8 utilities (#1046)
Browse files Browse the repository at this point in the history
  • Loading branch information
rept1d authored Dec 20, 2023
1 parent 912f8b3 commit 8144480
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 71 deletions.
30 changes: 22 additions & 8 deletions include/dpp/utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -738,22 +738,36 @@ uint32_t DPP_EXPORT hsl(int h, int s, int l);
std::string DPP_EXPORT debug_dump(uint8_t* data, size_t length);

/**
* @brief Returns the length of a UTF-8 string in codepoints
*
* @brief Returns the length of a UTF-8 string in codepoints.
* @note Result is unspecified for strings that are not valid UTF-8.
*
* @param str string to count length of
* @return size_t length of string (0 for invalid utf8)
* @return size_t Length of string
*/
size_t DPP_EXPORT utf8len(const std::string &str);
size_t DPP_EXPORT utf8len(std::string_view str);

/**
* @brief Return substring of a UTF-8 encoded string in codepoints
*
* @brief Return subview of a UTF-8 encoded string in codepoints.
* @note You must ensure that the resulting view is not used after the lifetime of the viewed string has ended.
* @note Result is unspecified for strings that are not valid UTF-8.
*
* @param str string to return substring from
* @param start start codepoint offset
* @param length length in codepoints
* @return std::string_view The requested subview
*/
std::string_view DPP_EXPORT utf8subview(std::string_view str, size_t start, size_t length);

/**
* @brief Return substring of a UTF-8 encoded string in codepoints.
* @note Result is unspecified for strings that are not valid UTF-8.
*
* @param str string to return substring from
* @param start start codepoint offset
* @param length length in codepoints
* @return std::string Substring in UTF-8 or empty string if invalid UTF-8 passed in
* @return std::string The requested substring
*/
std::string DPP_EXPORT utf8substr(const std::string& str, std::string::size_type start, std::string::size_type length);
std::string DPP_EXPORT utf8substr(std::string_view str, size_t start, size_t length);

/**
* @brief Read a whole file into a std::string.
Expand Down
2 changes: 1 addition & 1 deletion src/dpp/commandhandler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ bool commandhandler::string_has_prefix(std::string &str)
{
for (auto& p : prefixes) {
size_t prefix_length = utility::utf8len(p);
if (utility::utf8substr(str, 0, prefix_length) == p) {
if (utility::utf8subview(str, 0, prefix_length) == p) {
str.erase(str.begin(), str.begin() + prefix_length);
return true;
}
Expand Down
108 changes: 46 additions & 62 deletions src/dpp/utility.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -482,83 +482,67 @@ void exec(const std::string& cmd, std::vector<std::string> parameters, cmd_resul
t.detach();
}

/**
 * Count the code points in a UTF-8 encoded string.
 *
 * Only the lead byte of each sequence is inspected; continuation bytes are
 * skipped without being validated, so the result for malformed input is
 * unspecified, apart from the truncation case noted below.
 *
 * @param str UTF-8 text to measure
 * @return size_t Number of code points, or 0 if the final sequence is cut
 * short by the end of the string
 */
size_t utf8len(std::string_view str) {
	const size_t raw_len = str.length();
	size_t pos = 0;
	size_t code_points = 0;

	while (pos != raw_len) {
		/* Work on an unsigned byte so the comparisons below do not depend on
		 * whether plain char is signed on this platform. */
		const unsigned char cur = static_cast<unsigned char>(str[pos]);

		/* The lead byte announces the sequence length:
		 * below 0xC0 -> 1 byte, 0xC0.. -> 2, 0xE0.. -> 3, 0xF0.. -> 4. */
		size_t code_point_len = 1;
		code_point_len += static_cast<size_t>(cur >= 0xC0);
		code_point_len += static_cast<size_t>(cur >= 0xE0);
		code_point_len += static_cast<size_t>(cur >= 0xF0);

		if (raw_len - pos < code_point_len) {
			return 0; // truncated sequence: stop rather than step past the end
		}
		pos += code_point_len;

		++code_points;
	}

	return code_points;
}

/**
 * Return a view over a range of code points in a UTF-8 encoded string.
 *
 * The returned view refers to the same bytes as str and must not be used
 * after the viewed string's lifetime has ended. Only lead bytes are
 * inspected, so the result for malformed input is unspecified, apart from
 * the truncation case noted below.
 *
 * @param str UTF-8 text to slice
 * @param start Index of the first code point to include
 * @param length Maximum number of code points to include;
 * std::string_view::npos means "through the end of the string"
 * @return std::string_view The requested range. Empty if start is at or past
 * the end, or if a multi-byte sequence is cut short by the end of the string
 */
std::string_view utf8subview(std::string_view str, size_t start, size_t length) {
	const size_t raw_len = str.length();

	/* Saturate instead of letting start + length wrap around. With
	 * length == npos and start > 0 a wrapped sum would land *before* start
	 * and terminate the scan early with a bogus (underflowed) length. */
	const size_t end = (length > std::string_view::npos - start)
		? std::string_view::npos
		: start + length;

	size_t pos = 0;
	size_t code_points = 0;

	/* Defaults yield an empty view when start is never reached, and a view
	 * running to the end of the string when end is never reached. */
	size_t subview_start = raw_len;
	size_t subview_len = std::string_view::npos;

	while (pos != raw_len) {
		if (code_points == start) {
			subview_start = pos;
		}
		if (code_points == end) {
			subview_len = pos - subview_start;
			break; // no point in traversing the remainder of the string
		}

		const unsigned char cur = static_cast<unsigned char>(str[pos]);

		/* The lead byte announces the sequence length:
		 * below 0xC0 -> 1 byte, 0xC0.. -> 2, 0xE0.. -> 3, 0xF0.. -> 4. */
		size_t code_point_len = 1;
		code_point_len += static_cast<size_t>(cur >= 0xC0);
		code_point_len += static_cast<size_t>(cur >= 0xE0);
		code_point_len += static_cast<size_t>(cur >= 0xF0);

		if (raw_len - pos < code_point_len) {
			return ""; // truncated sequence: stop rather than step past the end
		}
		pos += code_point_len;

		++code_points;
	}

	return str.substr(subview_start, subview_len);
}

/**
 * Owning counterpart of utf8subview: selects the same code point range and
 * copies it into a new std::string.
 *
 * @param str UTF-8 text to slice
 * @param start Index of the first code point to include
 * @param length Number of code points to include
 * @return std::string Copy of the range chosen by utf8subview
 */
std::string utf8substr(std::string_view str, size_t start, size_t length) {
	const std::string_view selected = utf8subview(str, start, length);
	return std::string(selected);
}

std::string read_file(const std::string& filename)
Expand Down

0 comments on commit 8144480

Please sign in to comment.