diff --git a/rust/capture/src/router.rs b/rust/capture/src/router.rs index ce9221bfce49e..011bb49d67502 100644 --- a/rust/capture/src/router.rs +++ b/rust/capture/src/router.rs @@ -163,8 +163,8 @@ pub fn router< .layer(axum::middleware::from_fn(track_metrics)) .with_state(state); - // Don't install metrics unless asked. - // Installing a global recorder when capture is used as a library (during tests etc.) + // Don't install metrics unless asked to + // Installing a global recorder when capture is used as a library (during tests etc) // does not work well. if metrics { let recorder_handle = setup_metrics_recorder(); diff --git a/rust/capture/src/utils.rs b/rust/capture/src/utils.rs index 48c069b7bac56..c5a95fbe2481f 100644 --- a/rust/capture/src/utils.rs +++ b/rust/capture/src/utils.rs @@ -1,5 +1,3 @@ -use std::ops::RangeInclusive; - use rand::RngCore; use uuid::Uuid; @@ -38,191 +36,3 @@ pub fn uuid_v7() -> Uuid { encode_unix_timestamp_millis(now_millis, &bytes) } - -// TODO - at some point, you really ought to just sit down and -// write a state machine -pub fn replace_invalid_hex_escape_strings( - json_str: String, -) -> Result { - // Consume the String and get its bytes - let mut bytes = json_str.into_bytes(); - let len = bytes.len(); - let mut i = 0; - - const REPLACEMENT: &[u8; 4] = b"FFFD"; - const HIGH_SURROGATE_RANGE: RangeInclusive = 0xD800..=0xDBFF; - const LOW_SURROGATE_RANGE: RangeInclusive = 0xDC00..=0xDFFF; - - let mut escaped = false; - while i < len { - // First, figure out if this byte is the start of an escape sequence, - // and if it is, set the flag and move forward - if bytes[i] == b'\\' { - escaped = !escaped; // Handle e.g. "\\u1234" as not an escape sequence - i += 1; - continue; - } - - // If we're entering a hex escape - if escaped && bytes[i] == b'u' { - // Check if there are enough bytes for a Unicode escape sequence - if i + 4 < len { - // Extract the four escape sequence bytes - let mut codepoint_bytes: [u8; 4] = [0; 4]; - codepoint_bytes.copy_from_slice(&bytes[i + 1..i + 5]); - - // Convert the bytes to a string, then parse it into a u16 to check if it's a valid escape sequence - if let Ok(Ok(codepoint)) = - std::str::from_utf8(&codepoint_bytes).map(|s| u16::from_str_radix(s, 16)) - { - if (HIGH_SURROGATE_RANGE).contains(&codepoint) { - // High surrogate without a following low surrogate - if !is_next_low_surrogate(&bytes, i + 5) { - // Replace with 'FFFD' (Unicode replacement character) - codepoint_bytes.copy_from_slice(REPLACEMENT); - } else { - // This is a high surrogate, and the next is a low one, so we should skip over both - // without modification - i += 11; // u + 4 hex digits + \ + u + 4 hex digits = 11 bytes - escaped = false; // And exit the escape state - continue; - } - } else if (LOW_SURROGATE_RANGE).contains(&codepoint) { - // Unpaired low surrogate - we know this, because if it had a preceding high surrogate, - // we would have skipped over it in the previous iteration (above) - replace it - codepoint_bytes.copy_from_slice(REPLACEMENT); - } - // The unhandled else case is that this isn't part of a surrogate pair, so we don't need to do anything - } else { - // if we couldn't parse those 4 bytes as a hex escape code, or couldn't go from that hex escape code to a u16, replace with 'FFFD' - codepoint_bytes.copy_from_slice(REPLACEMENT); - } - bytes[i + 1] = codepoint_bytes[0]; - bytes[i + 2] = codepoint_bytes[1]; - bytes[i + 3] = codepoint_bytes[2]; - bytes[i + 4] = codepoint_bytes[3]; - i += 5; // Move past the Unicode escape sequence - escaped = false; // And exit the escape state - continue; - } else { - // Not enough bytes for a Unicode escape sequence, truncate the buffer - // to the 'u', and then append 'FFFD' - bytes.truncate(i + 1); - bytes.extend_from_slice(REPLACEMENT); - break; - } - } - - i += 1; - } - - // Convert bytes back to String - String::from_utf8(bytes) -} - -fn is_next_low_surrogate(bytes: &[u8], start: usize) -> bool { - let len = bytes.len(); - // This doesn't need to look backwards - it's only used when we're already - // parsing a high surrogate, and we're looking at the next 6 bytes - if start + 6 <= len && bytes[start] == b'\\' && bytes[start + 1] == b'u' { - let code_point_bytes = &bytes[start + 2..start + 6]; - if let Ok(code_point_str) = std::str::from_utf8(code_point_bytes) { - if let Ok(num) = u16::from_str_radix(code_point_str, 16) { - return (0xDC00..=0xDFFF).contains(&num); - } - } - } - false -} - -#[cfg(test)] -mod test { - - #[test] - pub fn treplace_unpaired_high_surrogate() { - let json_str = r#"{"key":"\uD800"}"#.to_string(); - let expected = r#"{"key":"\uFFFD"}"#.to_string(); - assert_eq!( - super::replace_invalid_hex_escape_strings(json_str).unwrap(), - expected - ); - } - - #[test] - pub fn replace_unpaired_low_surrogate() { - let json_str = r#"{"key":"\uDC00"}"#.to_string(); - let expected = r#"{"key":"\uFFFD"}"#.to_string(); - assert_eq!( - super::replace_invalid_hex_escape_strings(json_str).unwrap(), - expected - ); - } - - #[test] - pub fn replace_two_unpaired_low_surrogates() { - let json_str = r#"{"key":"\uDC00\uDC00"}"#.to_string(); - let expected = r#"{"key":"\uFFFD\uFFFD"}"#.to_string(); - assert_eq!( - super::replace_invalid_hex_escape_strings(json_str).unwrap(), - expected - ); - } - - #[test] - pub fn replace_two_unpaired_high_surrogates() { - let json_str = r#"{"key":"\uD800\uD800"}"#.to_string(); - let expected = r#"{"key":"\uFFFD\uFFFD"}"#.to_string(); - assert_eq!( - super::replace_invalid_hex_escape_strings(json_str).unwrap(), - expected - ); - } - - #[test] - pub fn replace_out_of_order_low_high_surrogates() { - let json_str = r#"{"key":"\uDC00\uD800"}"#.to_string(); - let expected = r#"{"key":"\uFFFD\uFFFD"}"#.to_string(); - assert_eq!( - super::replace_invalid_hex_escape_strings(json_str).unwrap(), - expected - ); - } - - #[test] - pub fn replace_unfinished_surrogate() { - let json_str = r#"{"key":"\uD800\uDC0"#.to_string(); - let expected = r#"{"key":"\uFFFD\uFFFD"#.to_string(); - assert_eq!( - super::replace_invalid_hex_escape_strings(json_str).unwrap(), - expected - ); - } - - #[test] - pub fn test_from_bad_data() { - let string = include_str!("../tests/session_recording_utf_surrogate_console.json"); - let replaced = super::replace_invalid_hex_escape_strings(string.to_string()).unwrap(); - let _result: serde_json::Value = serde_json::from_str(&replaced).unwrap(); - } - - #[test] - pub fn test_escaped_escape_sequences() { - let json_str = r#"{"key":"\\uDC00"}"#.to_string(); - let expected = r#"{"key":"\\uDC00"}"#.to_string(); - assert_eq!( - super::replace_invalid_hex_escape_strings(json_str).unwrap(), - expected - ); - } - - #[test] - pub fn test_escaped_escaped_escaped_sequences() { - let json_str = r#"{"key":"\\\uDC00"}"#.to_string(); - // Ordering is enter escape, find escaped \, enter escape, find invalid hex escape, replace with FFFD - let expected = r#"{"key":"\\\uFFFD"}"#.to_string(); - assert_eq!( - super::replace_invalid_hex_escape_strings(json_str).unwrap(), - expected - ); - } -} diff --git a/rust/capture/src/v0_endpoint.rs b/rust/capture/src/v0_endpoint.rs index 88069bdf150d7..e21e3a5c94520 100644 --- a/rust/capture/src/v0_endpoint.rs +++ b/rust/capture/src/v0_endpoint.rs @@ -26,9 +26,11 @@ use crate::{ }; /// Flexible endpoint that targets wide compatibility with the wide range of requests -/// processed by posthog-events. +/// currently processed by posthog-events (analytics events capture). Replay is out +/// of scope and should be processed on a separate endpoint. /// -/// TODO Because it must accommodate several shapes, it is inefficient in places. A v1 endpoint should be created, that only accepts the BatchedRequest payload shape. +/// Because it must accommodate several shapes, it is inefficient in places. A v1 +/// endpoint should be created, that only accepts the BatchedRequest payload shape. async fn handle_common( state: &State, InsecureClientIp(ip): &InsecureClientIp, diff --git a/rust/capture/src/v0_request.rs b/rust/capture/src/v0_request.rs index 328be8ee1905a..e3ea92c7ae53e 100644 --- a/rust/capture/src/v0_request.rs +++ b/rust/capture/src/v0_request.rs @@ -14,7 +14,6 @@ use uuid::Uuid; use crate::api::CaptureError; use crate::prometheus::report_dropped_events; use crate::token::validate_token; -use crate::utils::replace_invalid_hex_escape_strings; #[derive(Deserialize, Default)] pub enum Compression { @@ -177,13 +176,6 @@ impl RawRequest { s }; - let payload = replace_invalid_hex_escape_strings(payload).map_err(|e| { - tracing::error!("failed to replace invalid hex escape strings: {}", e); - CaptureError::RequestDecodingError(String::from( - "invalid body encoding - failed escaping invalid hex sequences", - )) - })?; - tracing::debug!(json = payload, "decoded event data"); Ok(serde_json::from_str::(&payload)?) } @@ -550,15 +542,4 @@ mod tests { let result = test_deserialize(json); assert!(result.is_err()); } - - #[test] - fn test_bad_utf() { - let the_bytes = include_bytes!("../tests/session_recording_utf_surrogate_console.json"); - let the_payload = RawRequest::from_bytes(Bytes::from_static(the_bytes), 25 * 1024 * 1024); - - if let Err(e) = &the_payload { - println!("{:?}", e); - } - assert!(the_payload.is_ok()); - } } diff --git a/rust/capture/tests/session_recording_utf_surrogate_console.json b/rust/capture/tests/session_recording_utf_surrogate_console.json deleted file mode 100644 index 7f33bf42717f4..0000000000000 --- a/rust/capture/tests/session_recording_utf_surrogate_console.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "event": "$snapshot", - "properties": { - "$snapshot_data": [ - { - "windowId": "01924ccf-34f9-764e-b7f9-73c74eb7ed55", - "data": { - "payload": { - "level": "warn", - "payload": ["\\\\\\\",\\\\\\\"emoji_flag\\\\\\\":\\\\\\\"\ud83c...[truncated]", "\"test\""], - "trace": [ - "q/< (https://internal-t.posthog.com/static/recorder.js?v=1.166.0:1:19808)", - "q (https://internal-t.posthog.com/static/recorder.js?v=1.166.0:1:20042)", - "z (https://internal-t.posthog.com/static/recorder.js?v=1.166.0:1:21009)", - "a (https://internal-t.posthog.com/static/recorder.js?v=1.166.0:1:34064)", - "e/this.emit (https://internal-t.posthog.com/static/recorder.js?v=1.166.0:1:35299)", - "e/this.processMutations (https://internal-t.posthog.com/static/recorder.js?v=1.166.0:1:33691)" - ] - }, - "plugin": "rrweb/console@1" - }, - "timestamp": 1727865503680, - "type": 6, - "seen": 1185537021728171 - } - ] - } -}