Fix: Prevent splitting multi-byte characters in message snippets (#404)

axllent · Dec 10, 2024 · b27a28c · b27a28c
1 parent b1c745f
commit b27a28c
Showing 1 changed file with 27 additions and 2 deletions.
diff --git a/internal/tools/snippets.go b/internal/tools/snippets.go
@@ -26,7 +26,7 @@ func CreateSnippet(text, html string) string {
 			return data
 		}
 
-		return data[0:limit] + "..."
+		return truncate(data, limit) + "..."
 	}
 
 	if text != "" {
@@ -37,8 +37,33 @@ func CreateSnippet(text, html string) string {
 			return text
 		}
 
-		return text[0:limit] + "..."
+		return truncate(text, limit) + "..."
 	}
 
 	return ""
 }
+
+// truncate returns a prefix of s at most n bytes long, cut so that valid UTF-8
+// input is never split mid-character. n must be >= 0: s[:n] panics otherwise.
+// Borrowed from Tailscale: https://github.com/tailscale/tailscale/blob/main/util/truncate/truncate.go
+func truncate(s string, n int) string {
+	if n >= len(s) {
+		return s
+	}
+
+	// Back up past continuation bytes until s[n-1] is not one (or n reaches 0).
+	for n > 0 && s[n-1]&0xc0 == 0x80 { // 0b10xxxxxx is a continuation byte
+		n--
+	}
+
+	// If s[n-1] is the lead byte of a multi-byte encoding, back up one more to
+	// drop it. The character may in fact have fit completely within the limit,
+	// but it is removed regardless so only this one direction needs checking.
+	//
+	// Otherwise s[n-1] is a single-byte code (0b00xxxxxx or 0b01xxxxxx) and stays.
+	if n > 0 && s[n-1]&0xc0 == 0xc0 { // 0b11xxxxxx starts a multi-byte encoding
+		n--
+	}
+
+	return s[:n]
+}