Skip to content

Commit

Permalink
[PROF-10656] Add faulting_address to crashtracker reports (#663)
Browse files Browse the repository at this point in the history
* [PROF-10656] Add crash_address to crashtracker reports

**What does this PR do?**

While investigating a customer crash, I found that I dearly missed
having the address that caused a segfault.

E.g. given this code:

```c
  char *potato = 0x1234;
  fprintf(stderr, "Value of potato is %c\n", *potato);
```

...I want to know that the crash was triggered when 0x1234 was
dereferenced, as this information may be key in debugging the issue.

We already get this information in our signal handler, so this PR
just adds the needed piping to report it to datadog.

**Motivation:**

Improve debugging of crashes reported by crashtracker.

**Additional Notes:**

I added the `crash_address` as a tag, similar to `siginfo` and
`signame`. We may want to send it as some other thing; no strong
opinions there.

**How to test the change?**

I was able to test this with the Ruby profiler. I've been trying
to update the tests but there's still some failing and I'm kinda
struggling with that. Hopefully someone with a bit more rust-fu than me
can help out there.

* Update tests

* Avoid allocating pretty-printed string for crash_adddress inside signal handler

Instead, handle all the formatting on the final output.

* Rename `crash_address` => `faulting_address`, as suggested in PR review

* Minor: Move read of `sig_info` inside `handle_posix_signal_impl`

On the off chance we ever ran into any issues here, let's make sure to
run this bit of code AFTER we go through the `NUM_TIMES_CALLED` check.

* Minor: Use `if let` instead of `match`
  • Loading branch information
ivoanjo authored Oct 28, 2024
1 parent 085a91a commit 22c2da5
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 10 deletions.
6 changes: 4 additions & 2 deletions bin_tests/tests/crashtracker_bin_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ fn test_crash_tracking_bin(
assert_eq!(
serde_json::json!({
"signum": 11,
"signame": "SIGSEGV"
"signame": "SIGSEGV",
"faulting_address": 0,
}),
crash_payload["siginfo"]
);
Expand Down Expand Up @@ -146,7 +147,8 @@ fn assert_telemetry_message(crash_telemetry: &[u8]) {
"profiler_collecting_sample:1",
"profiler_inactive:0",
"profiler_serializing:0",
"signame:SIGSEGV"
"signame:SIGSEGV",
"faulting_address:0x0000000000000000",
]),
tags
);
Expand Down
7 changes: 6 additions & 1 deletion crashtracker-ffi/src/crash_info/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,12 @@ impl<'a> TryFrom<SigInfo<'a>> for datadog_crashtracker::SigInfo {
fn try_from(value: SigInfo<'a>) -> Result<Self, Self::Error> {
let signum = value.signum;
let signame = option_from_char_slice(value.signame)?;
Ok(Self { signum, signame })
let faulting_address = None; // TODO: Expose this to FFI
Ok(Self {
signum,
signame,
faulting_address,
})
}
}

Expand Down
21 changes: 18 additions & 3 deletions crashtracker/src/collector/crash_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ pub fn shutdown_receiver() -> anyhow::Result<()> {
extern "C" fn handle_posix_sigaction(signum: i32, sig_info: *mut siginfo_t, ucontext: *mut c_void) {
// Handle the signal. Note this has a guard to ensure that we only generate
// one crash report per process.
let _ = handle_posix_signal_impl(signum);
let _ = handle_posix_signal_impl(signum, sig_info);

// Once we've handled the signal, chain to any previous handlers.
// SAFETY: This was created by [register_crash_handlers]. There is a tiny
Expand Down Expand Up @@ -303,7 +303,7 @@ extern "C" fn handle_posix_sigaction(signum: i32, sig_info: *mut siginfo_t, ucon
};
}

fn handle_posix_signal_impl(signum: i32) -> anyhow::Result<()> {
fn handle_posix_signal_impl(signum: i32, sig_info: *mut siginfo_t) -> anyhow::Result<()> {
static NUM_TIMES_CALLED: AtomicU64 = AtomicU64::new(0);
if NUM_TIMES_CALLED.fetch_add(1, SeqCst) > 0 {
// In the case where some lower-level signal handler recovered the error
Expand All @@ -324,13 +324,27 @@ fn handle_posix_signal_impl(signum: i32) -> anyhow::Result<()> {
anyhow::ensure!(!metadata_ptr.is_null(), "No crashtracking metadata");
let (_metadata, metadata_string) = unsafe { metadata_ptr.as_ref().context("metadata ptr")? };

let faulting_address: Option<usize> =
if !sig_info.is_null() && (signum == libc::SIGSEGV || signum == libc::SIGBUS) {
unsafe { Some((*sig_info).si_addr() as usize) }
} else {
None
};

match unsafe { receiver.as_mut().context("No crashtracking receiver")? } {
ReceiverType::ForkedProcess(child) => {
let pipe = child
.stdin
.as_mut()
.context("Crashtracker: Can't get pipe")?;
let res = emit_crashreport(pipe, config, config_str, metadata_string, signum);
let res = emit_crashreport(
pipe,
config,
config_str,
metadata_string,
signum,
faulting_address,
);
let _ = pipe.flush();
if config.wait_for_receiver {
// https://doc.rust-lang.org/std/process/struct.Child.html#method.wait
Expand Down Expand Up @@ -362,6 +376,7 @@ fn handle_posix_signal_impl(signum: i32) -> anyhow::Result<()> {
config_str,
metadata_string,
signum,
faulting_address,
);
let _ = unix_stream.flush();
unix_stream
Expand Down
18 changes: 15 additions & 3 deletions crashtracker/src/collector/emitters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,11 @@ pub(crate) fn emit_crashreport(
config_str: &str,
metadata_string: &str,
signum: i32,
faulting_address: Option<usize>,
) -> anyhow::Result<()> {
emit_metadata(pipe, metadata_string)?;
emit_config(pipe, config_str)?;
emit_siginfo(pipe, signum)?;
emit_siginfo(pipe, signum, faulting_address)?;
emit_procinfo(pipe)?;
pipe.flush()?;
emit_counters(pipe)?;
Expand Down Expand Up @@ -163,7 +164,11 @@ fn emit_proc_self_maps(w: &mut impl Write) -> anyhow::Result<()> {
Ok(())
}

fn emit_siginfo(w: &mut impl Write, signum: i32) -> anyhow::Result<()> {
fn emit_siginfo(
w: &mut impl Write,
signum: i32,
faulting_address: Option<usize>,
) -> anyhow::Result<()> {
let signame = if signum == libc::SIGSEGV {
"SIGSEGV"
} else if signum == libc::SIGBUS {
Expand All @@ -173,7 +178,14 @@ fn emit_siginfo(w: &mut impl Write, signum: i32) -> anyhow::Result<()> {
};

writeln!(w, "{DD_CRASHTRACK_BEGIN_SIGINFO}")?;
writeln!(w, "{{\"signum\": {signum}, \"signame\": \"{signame}\"}}")?;
if let Some(addr) = faulting_address {
writeln!(
w,
"{{\"signum\": {signum}, \"signame\": \"{signame}\", \"faulting_address\": {addr}}}"
)?;
} else {
writeln!(w, "{{\"signum\": {signum}, \"signame\": \"{signame}\"}}")?;
};
writeln!(w, "{DD_CRASHTRACK_END_SIGINFO}")?;
Ok(())
}
Expand Down
3 changes: 3 additions & 0 deletions crashtracker/src/crash_info/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ pub struct SigInfo {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub signame: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub faulting_address: Option<usize>,
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
Expand Down
7 changes: 6 additions & 1 deletion crashtracker/src/crash_info/telemetry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,9 @@ fn extract_crash_info_tags(crash_info: &CrashInfo) -> anyhow::Result<String> {
if let Some(signame) = &siginfo.signame {
write!(&mut tags, ",signame:{}", signame)?;
}
if let Some(faulting_address) = &siginfo.faulting_address {
write!(&mut tags, ",faulting_address:{:#018x}", faulting_address)?;
}
}
for (counter, value) in &crash_info.counters {
write!(&mut tags, ",{}:{}", counter, value)?;
Expand Down Expand Up @@ -276,6 +279,7 @@ mod tests {
siginfo: Some(SigInfo {
signum: 11,
signame: Some("SIGSEGV".to_owned()),
faulting_address: Some(0x1234),
}),
proc_info: None,
stacktrace: vec![],
Expand Down Expand Up @@ -308,7 +312,8 @@ mod tests {
"signum:11",
"signame:SIGSEGV",
"collecting_sample:1",
"not_profiling:0"
"not_profiling:0",
"faulting_address:0x0000000000001234",
]),
tags
);
Expand Down

0 comments on commit 22c2da5

Please sign in to comment.