diff --git a/include/dpp/utility.h b/include/dpp/utility.h index 55ec9e83b5..47b85f925d 100644 --- a/include/dpp/utility.h +++ b/include/dpp/utility.h @@ -738,22 +738,36 @@ uint32_t DPP_EXPORT hsl(int h, int s, int l); std::string DPP_EXPORT debug_dump(uint8_t* data, size_t length); /** - * @brief Returns the length of a UTF-8 string in codepoints - * + * @brief Returns the length of a UTF-8 string in codepoints. + * @note Result is unspecified for strings that are not valid UTF-8. + * * @param str string to count length of - * @return size_t length of string (0 for invalid utf8) + * @return size_t Length of string */ -size_t DPP_EXPORT utf8len(const std::string &str); +size_t DPP_EXPORT utf8len(std::string_view str); /** - * @brief Return substring of a UTF-8 encoded string in codepoints - * + * @brief Return subview of a UTF-8 encoded string in codepoints. + * @note You must ensure that the resulting view is not used after the lifetime of the viewed string has ended. + * @note Result is unspecified for strings that are not valid UTF-8. + * + * @param str string to return substring from + * @param start start codepoint offset + * @param length length in codepoints + * @return std::string_view The requested subview + */ +std::string_view DPP_EXPORT utf8subview(std::string_view str, size_t start, size_t length); + +/** + * @brief Return substring of a UTF-8 encoded string in codepoints. + * @note Result is unspecified for strings that are not valid UTF-8. + * * @param str string to return substring from * @param start start codepoint offset * @param length length in codepoints - * @return std::string Substring in UTF-8 or empty string if invalid UTF-8 passed in + * @return std::string The requested substring */ -std::string DPP_EXPORT utf8substr(const std::string& str, std::string::size_type start, std::string::size_type length); +std::string DPP_EXPORT utf8substr(std::string_view str, size_t start, size_t length); /** * @brief Read a whole file into a std::string. diff --git a/src/dpp/commandhandler.cpp b/src/dpp/commandhandler.cpp index 9e15168f53..e337b0eee9 100644 --- a/src/dpp/commandhandler.cpp +++ b/src/dpp/commandhandler.cpp @@ -168,7 +168,7 @@ bool commandhandler::string_has_prefix(std::string &str) { for (auto& p : prefixes) { size_t prefix_length = utility::utf8len(p); - if (utility::utf8substr(str, 0, prefix_length) == p) { + if (utility::utf8subview(str, 0, prefix_length) == p) { str.erase(str.begin(), str.begin() + prefix_length); return true; } diff --git a/src/dpp/utility.cpp b/src/dpp/utility.cpp index 07638d1201..ff0dcde276 100644 --- a/src/dpp/utility.cpp +++ b/src/dpp/utility.cpp @@ -482,83 +482,67 @@ void exec(const std::string& cmd, std::vector parameters, cmd_resul t.detach(); } -size_t utf8len(const std::string &str) -{ - size_t i = 0, iBefore = 0, count = 0; - const char* s = str.c_str(); - if (*s == 0) { - return 0; - } +size_t utf8len(std::string_view str) { + const size_t raw_len = str.length(); + size_t pos = 0; + size_t code_points = 0; - while (s[i] > 0) { -ascii: - i++; - } + while (pos != raw_len) { + const unsigned char cur = str[pos]; - count += i - iBefore; + size_t code_point_len = 1; + code_point_len += static_cast(cur >= 0b11000000); + code_point_len += static_cast(cur >= 0b11100000); + code_point_len += static_cast(cur >= 0b11110000); - while (s[i]) { - if (s[i] > 0) { - iBefore = i; - goto ascii; - } else { - switch (0xF0 & s[i]) { - case 0xE0: - i += 3; - break; - case 0xF0: - i += 4; - break; - default: - i += 2; - break; - } + if (raw_len - pos < code_point_len) { + return 0; // invalid utf8, avoid going past the end } + pos += code_point_len; - count++; + code_points += 1; } - return count; + return code_points; } -std::string utf8substr(const std::string& str, std::string::size_type start, std::string::size_type leng) -{ - if (leng == 0) { - return ""; - } - if (start == 0 && leng >= utf8len(str)) { - return str; - } - std::string::size_type i, ix, q, min = std::string::npos, max = std::string::npos; - for (q = 0, i = 0, ix = str.length(); i < ix; i++, q++) { - if (q == start) { - min = i; +std::string_view utf8subview(std::string_view str, size_t start, size_t length) { + const size_t raw_len = str.length(); + size_t pos = 0; + size_t code_points = 0; + + size_t subview_start = raw_len; + size_t subview_len = std::string_view::npos; + + while (pos != raw_len) { + if (code_points == start) { + subview_start = pos; } - if (q <= start + leng || leng == std::string::npos) { - max = i; + if (code_points == start + length) { + subview_len = pos - subview_start; + break; // no point in traversing the remainder of the string } - unsigned char c = (unsigned char)str[i]; - if (c < 0x80) { - i += 0; - } else if ((c & 0xE0) == 0xC0) { - i += 1; - } else if ((c & 0xF0) == 0xE0) { - i += 2; - } else if ((c & 0xF8) == 0xF0) { - i += 3; - } else { - return ""; //invalid utf8 + const unsigned char cur = str[pos]; + + size_t code_point_len = 1; + code_point_len += static_cast(cur >= 0b11000000); + code_point_len += static_cast(cur >= 0b11100000); + code_point_len += static_cast(cur >= 0b11110000); + + if (raw_len - pos < code_point_len) { + return ""; // invalid utf8, avoid going past the end } + pos += code_point_len; + + code_points += 1; } - if (q <= start + leng || leng == std::string::npos) { - max = i; - } - if (min == std::string::npos || max == std::string::npos) { - return ""; - } - return str.substr(min, max); + return str.substr(subview_start, subview_len); +} + +std::string utf8substr(std::string_view str, size_t start, size_t length) { + return std::string(utf8subview(str, start, length)); } std::string read_file(const std::string& filename)