From 772ecb9c7f9ac2513d59ee3b133743745dfdc0de Mon Sep 17 00:00:00 2001 From: rept1d Date: Tue, 19 Dec 2023 19:54:45 +0300 Subject: [PATCH 1/4] refactor: utf8len --- include/dpp/utility.h | 9 ++++---- src/dpp/utility.cpp | 52 +++++++++++++++---------------------------- 2 files changed, 23 insertions(+), 38 deletions(-) diff --git a/include/dpp/utility.h b/include/dpp/utility.h index 55ec9e83b5..892b76e9eb 100644 --- a/include/dpp/utility.h +++ b/include/dpp/utility.h @@ -738,12 +738,13 @@ uint32_t DPP_EXPORT hsl(int h, int s, int l); std::string DPP_EXPORT debug_dump(uint8_t* data, size_t length); /** - * @brief Returns the length of a UTF-8 string in codepoints - * + * @brief Returns the length of a UTF-8 string in codepoints. + * @note Result is unspecified for strings that are not valid UTF-8. + * * @param str string to count length of - * @return size_t length of string (0 for invalid utf8) + * @return size_t Length of string */ -size_t DPP_EXPORT utf8len(const std::string &str); +size_t DPP_EXPORT utf8len(std::string_view str); /** * @brief Return substring of a UTF-8 encoded string in codepoints diff --git a/src/dpp/utility.cpp b/src/dpp/utility.cpp index 07638d1201..a6a44436ff 100644 --- a/src/dpp/utility.cpp +++ b/src/dpp/utility.cpp @@ -482,43 +482,27 @@ void exec(const std::string& cmd, std::vector parameters, cmd_resul t.detach(); } -size_t utf8len(const std::string &str) -{ - size_t i = 0, iBefore = 0, count = 0; - const char* s = str.c_str(); - if (*s == 0) { - return 0; - } - - while (s[i] > 0) { -ascii: - i++; - } - - count += i - iBefore; - - while (s[i]) { - if (s[i] > 0) { - iBefore = i; - goto ascii; - } else { - switch (0xF0 & s[i]) { - case 0xE0: - i += 3; - break; - case 0xF0: - i += 4; - break; - default: - i += 2; - break; - } +size_t utf8len(std::string_view str) { + /* Shouldn't rely on signedness of char, better cast to unsigned char */ + const auto* const s = reinterpret_cast(str.data()); + + const size_t raw_len = str.length(); + size_t pos = 0; + size_t code_points = 0; + + while (pos != raw_len) { + size_t code_point_len = 1; + code_point_len += static_cast(s[pos] >= 0b11000000); + code_point_len += static_cast(s[pos] >= 0b11100000); + code_point_len += static_cast(s[pos] >= 0b11110000); + if (raw_len - pos < code_point_len) { + return 0; // invalid utf8, avoid going past the end } - - count++; + pos += code_point_len; + code_points += 1; } - return count; + return code_points; } std::string utf8substr(const std::string& str, std::string::size_type start, std::string::size_type leng) From 55a6d2053a549b802d7c3df522806ecbbebae01d Mon Sep 17 00:00:00 2001 From: rept1d Date: Tue, 19 Dec 2023 20:00:38 +0300 Subject: [PATCH 2/4] fix: utf8substr --- include/dpp/utility.h | 9 ++++--- src/dpp/utility.cpp | 59 ++++++++++++++++++++----------------------- 2 files changed, 32 insertions(+), 36 deletions(-) diff --git a/include/dpp/utility.h b/include/dpp/utility.h index 892b76e9eb..510d80c936 100644 --- a/include/dpp/utility.h +++ b/include/dpp/utility.h @@ -747,14 +747,15 @@ std::string DPP_EXPORT debug_dump(uint8_t* data, size_t length); size_t DPP_EXPORT utf8len(std::string_view str); /** - * @brief Return substring of a UTF-8 encoded string in codepoints - * + * @brief Return substring of a UTF-8 encoded string in codepoints. + * @note Result is unspecified for strings that are not valid UTF-8. + * * @param str string to return substring from * @param start start codepoint offset * @param length length in codepoints - * @return std::string Substring in UTF-8 or empty string if invalid UTF-8 passed in + * @return std::string The requested substring */ -std::string DPP_EXPORT utf8substr(const std::string& str, std::string::size_type start, std::string::size_type length); +std::string DPP_EXPORT utf8substr(std::string_view str, size_t start, size_t length); /** * @brief Read a whole file into a std::string. diff --git a/src/dpp/utility.cpp b/src/dpp/utility.cpp index a6a44436ff..6e20303573 100644 --- a/src/dpp/utility.cpp +++ b/src/dpp/utility.cpp @@ -505,44 +505,39 @@ size_t utf8len(std::string_view str) { return code_points; } -std::string utf8substr(const std::string& str, std::string::size_type start, std::string::size_type leng) -{ - if (leng == 0) { - return ""; - } - if (start == 0 && leng >= utf8len(str)) { - return str; - } - std::string::size_type i, ix, q, min = std::string::npos, max = std::string::npos; - for (q = 0, i = 0, ix = str.length(); i < ix; i++, q++) { - if (q == start) { - min = i; +std::string utf8substr(std::string_view str, size_t start, size_t length) { + /* Shouldn't rely on signedness of char, better cast to unsigned char */ + const auto* const s = reinterpret_cast(str.data()); + + const size_t raw_len = str.length(); + size_t pos = 0; + size_t code_points = 0; + + size_t subview_start = raw_len; + size_t subview_len = std::string_view::npos; + + while (pos != raw_len) { + if (code_points == start) { + subview_start = pos; } - if (q <= start + leng || leng == std::string::npos) { - max = i; + if (code_points == start + length) { + subview_len = pos - subview_start; + break; // no point in traversing the remainder of the string } - unsigned char c = (unsigned char)str[i]; - if (c < 0x80) { - i += 0; - } else if ((c & 0xE0) == 0xC0) { - i += 1; - } else if ((c & 0xF0) == 0xE0) { - i += 2; - } else if ((c & 0xF8) == 0xF0) { - i += 3; - } else { - return ""; //invalid utf8 + size_t code_point_len = 1; + code_point_len += static_cast(s[pos] >= 0b11000000); + code_point_len += static_cast(s[pos] >= 0b11100000); + code_point_len += static_cast(s[pos] >= 0b11110000); + + if (raw_len - pos < code_point_len) { + return ""; // invalid utf8, avoid going past the end } - } - if (q <= start + leng || leng == std::string::npos) { - max = i; - } - if (min == std::string::npos || max == std::string::npos) { - return ""; + pos += code_point_len; + code_points += 1; } - return str.substr(min, max); + return std::string(str.substr(subview_start, subview_len)); } std::string read_file(const std::string& filename) From 64433591b6ea6c924eb34d0091e1090bda58d736 Mon Sep 17 00:00:00 2001 From: rept1d Date: Tue, 19 Dec 2023 20:04:00 +0300 Subject: [PATCH 3/4] feat: add and use utf8subview --- include/dpp/utility.h | 12 ++++++++++++ src/dpp/commandhandler.cpp | 2 +- src/dpp/utility.cpp | 8 ++++++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/include/dpp/utility.h b/include/dpp/utility.h index 510d80c936..47b85f925d 100644 --- a/include/dpp/utility.h +++ b/include/dpp/utility.h @@ -746,6 +746,18 @@ std::string DPP_EXPORT debug_dump(uint8_t* data, size_t length); */ size_t DPP_EXPORT utf8len(std::string_view str); +/** + * @brief Return subview of a UTF-8 encoded string in codepoints. + * @note You must ensure that the resulting view is not used after the lifetime of the viewed string has ended. + * @note Result is unspecified for strings that are not valid UTF-8. + * + * @param str string to return substring from + * @param start start codepoint offset + * @param length length in codepoints + * @return std::string_view The requested subview + */ +std::string_view DPP_EXPORT utf8subview(std::string_view str, size_t start, size_t length); + /** * @brief Return substring of a UTF-8 encoded string in codepoints. * @note Result is unspecified for strings that are not valid UTF-8. diff --git a/src/dpp/commandhandler.cpp b/src/dpp/commandhandler.cpp index 9e15168f53..e337b0eee9 100644 --- a/src/dpp/commandhandler.cpp +++ b/src/dpp/commandhandler.cpp @@ -168,7 +168,7 @@ bool commandhandler::string_has_prefix(std::string &str) { for (auto& p : prefixes) { size_t prefix_length = utility::utf8len(p); - if (utility::utf8substr(str, 0, prefix_length) == p) { + if (utility::utf8subview(str, 0, prefix_length) == p) { str.erase(str.begin(), str.begin() + prefix_length); return true; } diff --git a/src/dpp/utility.cpp b/src/dpp/utility.cpp index 6e20303573..ff610feeb9 100644 --- a/src/dpp/utility.cpp +++ b/src/dpp/utility.cpp @@ -505,7 +505,7 @@ size_t utf8len(std::string_view str) { return code_points; } -std::string utf8substr(std::string_view str, size_t start, size_t length) { +std::string_view utf8subview(std::string_view str, size_t start, size_t length) { /* Shouldn't rely on signedness of char, better cast to unsigned char */ const auto* const s = reinterpret_cast(str.data()); @@ -537,7 +537,11 @@ std::string utf8substr(std::string_view str, size_t start, size_t length) { code_points += 1; } - return std::string(str.substr(subview_start, subview_len)); + return str.substr(subview_start, subview_len); +} + +std::string utf8substr(std::string_view str, size_t start, size_t length) { + return std::string(utf8subview(str, start, length)); } std::string read_file(const std::string& filename) From d99070a6b2aabea6535d92f3b241125a3780e507 Mon Sep 17 00:00:00 2001 From: rept1d Date: Tue, 19 Dec 2023 21:49:26 +0300 Subject: [PATCH 4/4] refactor: get rid of reintepret_cast in utf8 utils --- src/dpp/utility.cpp | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/dpp/utility.cpp b/src/dpp/utility.cpp index ff610feeb9..ff0dcde276 100644 --- a/src/dpp/utility.cpp +++ b/src/dpp/utility.cpp @@ -483,22 +483,23 @@ void exec(const std::string& cmd, std::vector parameters, cmd_resul } size_t utf8len(std::string_view str) { - /* Shouldn't rely on signedness of char, better cast to unsigned char */ - const auto* const s = reinterpret_cast(str.data()); - const size_t raw_len = str.length(); size_t pos = 0; size_t code_points = 0; while (pos != raw_len) { + const unsigned char cur = str[pos]; + size_t code_point_len = 1; - code_point_len += static_cast(s[pos] >= 0b11000000); - code_point_len += static_cast(s[pos] >= 0b11100000); - code_point_len += static_cast(s[pos] >= 0b11110000); + code_point_len += static_cast(cur >= 0b11000000); + code_point_len += static_cast(cur >= 0b11100000); + code_point_len += static_cast(cur >= 0b11110000); + if (raw_len - pos < code_point_len) { return 0; // invalid utf8, avoid going past the end } pos += code_point_len; + code_points += 1; } @@ -506,9 +507,6 @@ size_t utf8len(std::string_view str) { } std::string_view utf8subview(std::string_view str, size_t start, size_t length) { - /* Shouldn't rely on signedness of char, better cast to unsigned char */ - const auto* const s = reinterpret_cast(str.data()); - const size_t raw_len = str.length(); size_t pos = 0; size_t code_points = 0; @@ -525,15 +523,18 @@ std::string_view utf8subview(std::string_view str, size_t start, size_t length) break; // no point in traversing the remainder of the string } + const unsigned char cur = str[pos]; + size_t code_point_len = 1; - code_point_len += static_cast(s[pos] >= 0b11000000); - code_point_len += static_cast(s[pos] >= 0b11100000); - code_point_len += static_cast(s[pos] >= 0b11110000); + code_point_len += static_cast(cur >= 0b11000000); + code_point_len += static_cast(cur >= 0b11100000); + code_point_len += static_cast(cur >= 0b11110000); if (raw_len - pos < code_point_len) { return ""; // invalid utf8, avoid going past the end } pos += code_point_len; + code_points += 1; }