Skip to content

Commit

Permalink
chore: Revert "fix: replay will send nonsense strings that should be …
Browse files Browse the repository at this point in the history
…captured" (#25364)
  • Loading branch information
oliverb123 authored Oct 3, 2024
1 parent 50f9e4f commit 55f65f8
Show file tree
Hide file tree
Showing 5 changed files with 6 additions and 241 deletions.
4 changes: 2 additions & 2 deletions rust/capture/src/router.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,8 @@ pub fn router<
.layer(axum::middleware::from_fn(track_metrics))
.with_state(state);

// Don't install metrics unless asked.
// Installing a global recorder when capture is used as a library (during tests etc.)
// Don't install metrics unless asked to
// Installing a global recorder when capture is used as a library (during tests etc)
// does not work well.
if metrics {
let recorder_handle = setup_metrics_recorder();
Expand Down
190 changes: 0 additions & 190 deletions rust/capture/src/utils.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::ops::RangeInclusive;

use rand::RngCore;
use uuid::Uuid;

Expand Down Expand Up @@ -38,191 +36,3 @@ pub fn uuid_v7() -> Uuid {

encode_unix_timestamp_millis(now_millis, &bytes)
}

// TODO - at some point, you really ought to just sit down and
// write a state machine
pub fn replace_invalid_hex_escape_strings(
json_str: String,
) -> Result<String, std::string::FromUtf8Error> {
// Consume the String and get its bytes
let mut bytes = json_str.into_bytes();
let len = bytes.len();
let mut i = 0;

const REPLACEMENT: &[u8; 4] = b"FFFD";
const HIGH_SURROGATE_RANGE: RangeInclusive<u16> = 0xD800..=0xDBFF;
const LOW_SURROGATE_RANGE: RangeInclusive<u16> = 0xDC00..=0xDFFF;

let mut escaped = false;
while i < len {
// First, figure out if this byte is the start of an escape sequence,
// and if it is, set the flag and move forward
if bytes[i] == b'\\' {
escaped = !escaped; // Handle e.g. "\\u1234" as not an escape sequence
i += 1;
continue;
}

// If we're entering a hex escape
if escaped && bytes[i] == b'u' {
// Check if there are enough bytes for a Unicode escape sequence
if i + 4 < len {
// Extract the four escape sequence bytes
let mut codepoint_bytes: [u8; 4] = [0; 4];
codepoint_bytes.copy_from_slice(&bytes[i + 1..i + 5]);

// Convert the bytes to a string, then parse it into a u16 to check if it's a valid escape sequence
if let Ok(Ok(codepoint)) =
std::str::from_utf8(&codepoint_bytes).map(|s| u16::from_str_radix(s, 16))
{
if (HIGH_SURROGATE_RANGE).contains(&codepoint) {
// High surrogate without a following low surrogate
if !is_next_low_surrogate(&bytes, i + 5) {
// Replace with 'FFFD' (Unicode replacement character)
codepoint_bytes.copy_from_slice(REPLACEMENT);
} else {
// This is a high surrogate, and the next is a low one, so we should skip over both
// without modification
i += 11; // u + 4 hex digits + \ + u + 4 hex digits = 11 bytes
escaped = false; // And exit the escape state
continue;
}
} else if (LOW_SURROGATE_RANGE).contains(&codepoint) {
// Unpaired low surrogate - we know this, because if it had a preceding high surrogate,
// we would have skipped over it in the previous iteration (above) - replace it
codepoint_bytes.copy_from_slice(REPLACEMENT);
}
// The unhandled else case is that this isn't part of a surrogate pair, so we don't need to do anything
} else {
// if we couldn't parse those 4 bytes as a hex escape code, or couldn't go from that hex escape code to a u16, replace with 'FFFD'
codepoint_bytes.copy_from_slice(REPLACEMENT);
}
bytes[i + 1] = codepoint_bytes[0];
bytes[i + 2] = codepoint_bytes[1];
bytes[i + 3] = codepoint_bytes[2];
bytes[i + 4] = codepoint_bytes[3];
i += 5; // Move past the Unicode escape sequence
escaped = false; // And exit the escape state
continue;
} else {
// Not enough bytes for a Unicode escape sequence, truncate the buffer
// to the 'u', and then append 'FFFD'
bytes.truncate(i + 1);
bytes.extend_from_slice(REPLACEMENT);
break;
}
}

i += 1;
}

// Convert bytes back to String
String::from_utf8(bytes)
}

fn is_next_low_surrogate(bytes: &[u8], start: usize) -> bool {
let len = bytes.len();
// This doesn't need to look backwards - it's only used when we're already
// parsing a high surrogate, and we're looking at the next 6 bytes
if start + 6 <= len && bytes[start] == b'\\' && bytes[start + 1] == b'u' {
let code_point_bytes = &bytes[start + 2..start + 6];
if let Ok(code_point_str) = std::str::from_utf8(code_point_bytes) {
if let Ok(num) = u16::from_str_radix(code_point_str, 16) {
return (0xDC00..=0xDFFF).contains(&num);
}
}
}
false
}

#[cfg(test)]
mod test {

#[test]
pub fn treplace_unpaired_high_surrogate() {
let json_str = r#"{"key":"\uD800"}"#.to_string();
let expected = r#"{"key":"\uFFFD"}"#.to_string();
assert_eq!(
super::replace_invalid_hex_escape_strings(json_str).unwrap(),
expected
);
}

#[test]
pub fn replace_unpaired_low_surrogate() {
let json_str = r#"{"key":"\uDC00"}"#.to_string();
let expected = r#"{"key":"\uFFFD"}"#.to_string();
assert_eq!(
super::replace_invalid_hex_escape_strings(json_str).unwrap(),
expected
);
}

#[test]
pub fn replace_two_unpaired_low_surrogates() {
let json_str = r#"{"key":"\uDC00\uDC00"}"#.to_string();
let expected = r#"{"key":"\uFFFD\uFFFD"}"#.to_string();
assert_eq!(
super::replace_invalid_hex_escape_strings(json_str).unwrap(),
expected
);
}

#[test]
pub fn replace_two_unpaired_high_surrogates() {
let json_str = r#"{"key":"\uD800\uD800"}"#.to_string();
let expected = r#"{"key":"\uFFFD\uFFFD"}"#.to_string();
assert_eq!(
super::replace_invalid_hex_escape_strings(json_str).unwrap(),
expected
);
}

#[test]
pub fn replace_out_of_order_low_high_surrogates() {
let json_str = r#"{"key":"\uDC00\uD800"}"#.to_string();
let expected = r#"{"key":"\uFFFD\uFFFD"}"#.to_string();
assert_eq!(
super::replace_invalid_hex_escape_strings(json_str).unwrap(),
expected
);
}

#[test]
pub fn replace_unfinished_surrogate() {
let json_str = r#"{"key":"\uD800\uDC0"#.to_string();
let expected = r#"{"key":"\uFFFD\uFFFD"#.to_string();
assert_eq!(
super::replace_invalid_hex_escape_strings(json_str).unwrap(),
expected
);
}

#[test]
pub fn test_from_bad_data() {
let string = include_str!("../tests/session_recording_utf_surrogate_console.json");
let replaced = super::replace_invalid_hex_escape_strings(string.to_string()).unwrap();
let _result: serde_json::Value = serde_json::from_str(&replaced).unwrap();
}

#[test]
pub fn test_escaped_escape_sequences() {
let json_str = r#"{"key":"\\uDC00"}"#.to_string();
let expected = r#"{"key":"\\uDC00"}"#.to_string();
assert_eq!(
super::replace_invalid_hex_escape_strings(json_str).unwrap(),
expected
);
}

#[test]
pub fn test_escaped_escaped_escaped_sequences() {
let json_str = r#"{"key":"\\\uDC00"}"#.to_string();
// Ordering is enter escape, find escaped \, enter escape, find invalid hex escape, replace with FFFD
let expected = r#"{"key":"\\\uFFFD"}"#.to_string();
assert_eq!(
super::replace_invalid_hex_escape_strings(json_str).unwrap(),
expected
);
}
}
6 changes: 4 additions & 2 deletions rust/capture/src/v0_endpoint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@ use crate::{
};

/// Flexible endpoint that targets wide compatibility with the wide range of requests
/// processed by posthog-events.
/// currently processed by posthog-events (analytics events capture). Replay is out
/// of scope and should be processed on a separate endpoint.
///
/// TODO Because it must accommodate several shapes, it is inefficient in places. A v1 endpoint should be created, that only accepts the BatchedRequest payload shape.
/// Because it must accommodate several shapes, it is inefficient in places. A v1
/// endpoint should be created, that only accepts the BatchedRequest payload shape.
async fn handle_common(
state: &State<router::State>,
InsecureClientIp(ip): &InsecureClientIp,
Expand Down
19 changes: 0 additions & 19 deletions rust/capture/src/v0_request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ use uuid::Uuid;
use crate::api::CaptureError;
use crate::prometheus::report_dropped_events;
use crate::token::validate_token;
use crate::utils::replace_invalid_hex_escape_strings;

#[derive(Deserialize, Default)]
pub enum Compression {
Expand Down Expand Up @@ -177,13 +176,6 @@ impl RawRequest {
s
};

let payload = replace_invalid_hex_escape_strings(payload).map_err(|e| {
tracing::error!("failed to replace invalid hex escape strings: {}", e);
CaptureError::RequestDecodingError(String::from(
"invalid body encoding - failed escaping invalid hex sequences",
))
})?;

tracing::debug!(json = payload, "decoded event data");
Ok(serde_json::from_str::<RawRequest>(&payload)?)
}
Expand Down Expand Up @@ -550,15 +542,4 @@ mod tests {
let result = test_deserialize(json);
assert!(result.is_err());
}

#[test]
fn test_bad_utf() {
let the_bytes = include_bytes!("../tests/session_recording_utf_surrogate_console.json");
let the_payload = RawRequest::from_bytes(Bytes::from_static(the_bytes), 25 * 1024 * 1024);

if let Err(e) = &the_payload {
println!("{:?}", e);
}
assert!(the_payload.is_ok());
}
}
28 changes: 0 additions & 28 deletions rust/capture/tests/session_recording_utf_surrogate_console.json

This file was deleted.

0 comments on commit 55f65f8

Please sign in to comment.