Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[crashtracking] Add explicit test for extra children #720

Merged
merged 5 commits into from
Nov 11, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 155 additions & 0 deletions crashtracker/src/collector/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -593,3 +593,158 @@ fn test_altstack_nouse() -> anyhow::Result<()> {
// OK, we're done
Ok(())
}

#[cfg_attr(miri, ignore)]
#[cfg(target_os = "linux")]
#[test]
fn test_waitall_nohang() -> anyhow::Result<()> {
// This test checks whether the crashtracking implementation can cause malformed `waitall()`
// idioms to hang.
// Consider the following code from the Ruby runtime:
//
// static VALUE
// proc_waitall(VALUE _)
// {
// VALUE result;
// rb_pid_t pid;
// int status;
//
// result = rb_ary_new();
// rb_last_status_clear();
//
// for (pid = -1;;) {
// pid = rb_waitpid(-1, &status, 0);
// if (pid == -1) {
// int e = errno;
// if (e == ECHILD)
// break;
// rb_syserr_fail(e, 0);
// }
// rb_ary_push(result, rb_assoc_new(PIDT2NUM(pid), rb_last_status_get()));
// }
// return result;
// }
//
// The intent here is to wait for all of one's child processes to exit. This is a pretty
// standard operation in multi-process situations, with one important caveat: usually you know
// your children ahead of time and can wait on them in a controlled, intentional matter.
// Previous versions of crashtracking, which spawned long-lived receiver processes, would
// interfere with this
//
// This implements the inner behavior of a test which allows the caller to control which
// options are used.
use crate::StacktraceCollection;
use chrono::Utc;
use ddcommon::Endpoint;

let time = Utc::now().to_rfc3339();
let dir = "/tmp/crashreports/";
let output_url = format!("file://{dir}{time}.txt");

let endpoint = Some(Endpoint::from_slice(&output_url));

let path_to_receiver_binary =
"/tmp/libdatadog/bin/libdatadog-crashtracking-receiver".to_string();
let create_alt_stack = true; // doesn't matter, but most runtimes use it, so take it
let use_alt_stack = true;
let resolve_frames = StacktraceCollection::EnabledWithInprocessSymbols;
let stderr_filename = Some(format!("{dir}/stderr_{time}.txt"));
let stdout_filename = Some(format!("{dir}/stdout_{time}.txt"));
let timeout_ms = 10_000;
let receiver_config = CrashtrackerReceiverConfig::new(
vec![],
vec![],
path_to_receiver_binary,
stderr_filename,
stdout_filename,
)?;
let config = CrashtrackerConfiguration::new(
vec![],
create_alt_stack,
use_alt_stack,
endpoint,
resolve_frames,
timeout_ms,
)?;

let metadata = CrashtrackerMetadata::new(
"libname".to_string(),
"version".to_string(),
"family".to_string(),
vec![],
);

// Since this test ultimately mutates process state, it's done inside of a fork just like the
// other tests of the same ilk.
match unsafe { libc::fork() } {
-1 => {
panic!("Failed to fork");
}
0 => {
// Child process
// This is where the test actually happens!
init(config, receiver_config, metadata)?;

// Now spawn some short-lived child processes.
// Note: it's easy to confirm this test actually works by cranking the sleep duration
// up past the timeout duration. At such a point, the test should fail.
sanchda marked this conversation as resolved.
Show resolved Hide resolved
let mut children = vec![];
let sleep_duration_ms = 100;
let timeout_duration_ms = 150;
for _ in 0..10 {
sanchda marked this conversation as resolved.
Show resolved Hide resolved
match unsafe { libc::fork() } {
-1 => {
panic!("Failed to fork");
}
0 => {
// Grandchild process
std::thread::sleep(std::time::Duration::from_millis(sleep_duration_ms));
std::process::exit(0); // normal exit, since we're testing waitall
}
pid => {
// Parent process
children.push(pid); // unused in this test
}
}
}

// Now, do the equivalent of the waitall loop.
// One caveat is that we do not want to hang the test, so rather than an unbounded
// `waitpid()`, use WNOHANG within a timer loop.
let timeout = std::time::Duration::from_millis(timeout_duration_ms);
let start_time = std::time::Instant::now();
loop {
if start_time.elapsed() > timeout {
eprintln!("Timed out waiting for children to exit");
std::process::exit(-6);
}

// Call waitpid with WNOHANG
let mut status = 0;
let pid = unsafe { libc::waitpid(-1, &mut status, libc::WNOHANG) };
let errno = std::io::Error::last_os_error().raw_os_error().unwrap();

if pid == -1 && errno == libc::ECHILD {
// No more children! Done!
std::process::exit(42);
}
}
}
pid => {
// Parent process
let mut status = 0;
let _ = unsafe { libc::waitpid(pid, &mut status, 0) };

// `status` is not the exit code, gotta unwrap some layers
if libc::WIFEXITED(status) {
let exit_code = libc::WEXITSTATUS(status);
assert_eq!(exit_code, 42, "Child process exited with unexpected status");
} else {
panic!("Child process did not exit normally");
}
}
}

// Confirmed that we have no children, so we're done.
Ok(())
}
Loading