From 74cac85f33a6c133728feb432be128fe39770cc8 Mon Sep 17 00:00:00 2001 From: Markus Stange Date: Wed, 17 Jul 2024 13:57:07 -0400 Subject: [PATCH 1/4] Move Cpus to shared dir. --- samply/src/linux_shared/converter.rs | 2 +- samply/src/linux_shared/mod.rs | 1 - samply/src/shared/mod.rs | 1 + samply/src/{linux_shared => shared}/per_cpu.rs | 0 4 files changed, 2 insertions(+), 2 deletions(-) rename samply/src/{linux_shared => shared}/per_cpu.rs (100%) diff --git a/samply/src/linux_shared/converter.rs b/samply/src/linux_shared/converter.rs index 22425c6d..371431fd 100644 --- a/samply/src/linux_shared/converter.rs +++ b/samply/src/linux_shared/converter.rs @@ -35,7 +35,6 @@ use super::injected_jit_object::{correct_bad_perf_jit_so_file, jit_function_name use super::kernel_symbols::{kernel_module_build_id, KernelSymbols}; use super::mmap_range_or_vec::MmapRangeOrVec; use super::pe_mappings::{PeMappings, SuspectedPeMapping}; -use super::per_cpu::Cpus; use super::processes::Processes; use super::rss_stat::{RssStat, MM_ANONPAGES, MM_FILEPAGES, MM_SHMEMPAGES, MM_SWAPENTS}; use super::svma_file_range::compute_vma_bias; @@ -43,6 +42,7 @@ use super::vdso::VdsoObject; use crate::shared::context_switch::{ContextSwitchHandler, OffCpuSampleGroup}; use crate::shared::jit_category_manager::JitCategoryManager; use crate::shared::lib_mappings::{AndroidArtInfo, LibMappingInfo}; +use crate::shared::per_cpu::Cpus; use crate::shared::process_name::make_process_name; use crate::shared::process_sample_data::{ OtherEventMarker, RssStatMarker, RssStatMember, SchedSwitchMarkerOnCpuTrack, diff --git a/samply/src/linux_shared/mod.rs b/samply/src/linux_shared/mod.rs index 4238b1b6..590e070e 100644 --- a/samply/src/linux_shared/mod.rs +++ b/samply/src/linux_shared/mod.rs @@ -7,7 +7,6 @@ mod kernel_symbols; mod mmap_range_or_vec; mod object_rewriter; mod pe_mappings; -mod per_cpu; mod process; mod process_threads; mod processes; diff --git a/samply/src/shared/mod.rs b/samply/src/shared/mod.rs index e33d94f5..ef0a6ef9 100644 --- a/samply/src/shared/mod.rs +++ b/samply/src/shared/mod.rs @@ -7,6 +7,7 @@ pub mod jit_function_recycler; pub mod jitdump_manager; pub mod lib_mappings; pub mod marker_file; +pub mod per_cpu; pub mod perf_map; pub mod process_name; pub mod process_sample_data; diff --git a/samply/src/linux_shared/per_cpu.rs b/samply/src/shared/per_cpu.rs similarity index 100% rename from samply/src/linux_shared/per_cpu.rs rename to samply/src/shared/per_cpu.rs From a8801ced09edf2c96b78f7e849080f2359ff7ac5 Mon Sep 17 00:00:00 2001 From: Markus Stange Date: Wed, 17 Jul 2024 14:02:11 -0400 Subject: [PATCH 2/4] Combine these two calls to add_sample. --- samply/src/windows/profile_context.rs | 37 ++++++++++----------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/samply/src/windows/profile_context.rs b/samply/src/windows/profile_context.rs index df552033..4b07a7ec 100644 --- a/samply/src/windows/profile_context.rs +++ b/samply/src/windows/profile_context.rs @@ -1272,31 +1272,22 @@ impl ProfileContext { } if let Some(cpu_delta) = on_cpu_sample_cpu_delta { - if let Some(mut combined_stack) = kernel_stack { + let stack_index = if let Some(mut combined_stack) = kernel_stack { combined_stack.extend_from_slice(&user_stack[..]); - let combined_stack_index = self - .unresolved_stacks - .convert(combined_stack.into_iter().rev()); - process.unresolved_samples.add_sample( - thread.handle, - timestamp, - timestamp_raw, - combined_stack_index, - cpu_delta, - 1, - None, - ); + self.unresolved_stacks + .convert(combined_stack.into_iter().rev()) } else { - process.unresolved_samples.add_sample( - thread.handle, - timestamp, - timestamp_raw, - user_stack_index, - cpu_delta, - 1, - None, - ); - } + user_stack_index + }; + process.unresolved_samples.add_sample( + thread.handle, + timestamp, + timestamp_raw, + stack_index, + cpu_delta, + 1, + None, + ); self.stack_sample_count += 1; } } From ec66c8b5a3c388b68eec8c18d63051500a4e0917 Mon Sep 17 00:00:00 2001 From: Markus Stange Date: Wed, 17 Jul 2024 15:29:36 -0400 Subject: [PATCH 3/4] Refactor the sample stack stuff. --- samply/src/windows/profile_context.rs | 218 +++++++++++++++----------- 1 file changed, 123 insertions(+), 95 deletions(-) diff --git a/samply/src/windows/profile_context.rs b/samply/src/windows/profile_context.rs index 4b07a7ec..56c31888 100644 --- a/samply/src/windows/profile_context.rs +++ b/samply/src/windows/profile_context.rs @@ -28,7 +28,9 @@ use crate::shared::recycling::{ProcessRecycler, ProcessRecyclingData, ThreadRecy use crate::shared::synthetic_jit_library::SyntheticJitLibrary; use crate::shared::timestamp_converter::TimestampConverter; use crate::shared::types::{StackFrame, StackMode}; -use crate::shared::unresolved_samples::{UnresolvedSamples, UnresolvedStacks}; +use crate::shared::unresolved_samples::{ + UnresolvedSamples, UnresolvedStackHandle, UnresolvedStacks, +}; use crate::windows::firefox::{ PHASE_INSTANT, PHASE_INTERVAL, PHASE_INTERVAL_END, PHASE_INTERVAL_START, }; @@ -36,13 +38,14 @@ use crate::windows::firefox::{ /// An on- or off-cpu-sample for which the user stack is not known yet. /// Consumed once the user stack arrives. #[derive(Debug, Clone)] -pub struct PendingStack { +pub struct SampleWithPendingStack { /// The timestamp of the SampleProf or CSwitch event pub timestamp: u64, /// Starts out as None. Once we encounter the kernel stack (if any), we put it here. pub kernel_stack: Option>, pub off_cpu_sample_group: Option, - pub on_cpu_sample_cpu_delta: Option, + pub cpu_delta: CpuDelta, + pub has_on_cpu_sample: bool, } #[derive(Debug)] @@ -137,7 +140,7 @@ pub struct Thread { pub is_main_thread: bool, pub handle: ThreadHandle, pub label_frame: FrameInfo, - pub pending_stacks: VecDeque, + pub samples_with_pending_stacks: VecDeque, pub context_switch_data: ThreadContextSwitchData, #[allow(dead_code)] pub thread_id: u32, @@ -161,7 +164,7 @@ impl Thread { is_main_thread, handle, label_frame, - pending_stacks: VecDeque::new(), + samples_with_pending_stacks: VecDeque::new(), context_switch_data: Default::default(), pending_markers: HashMap::new(), thread_id: tid, @@ -1172,17 +1175,17 @@ impl ProfileContext { let Some(thread) = self.threads.get_by_tid(tid) else { return; }; - if let Some(pending_stack) = thread - .pending_stacks + if let Some(sample_info) = thread + .samples_with_pending_stacks .iter_mut() .rev() .find(|s| s.timestamp == timestamp_raw) { - if let Some(kernel_stack) = pending_stack.kernel_stack.as_mut() { + if let Some(kernel_stack) = sample_info.kernel_stack.as_mut() { log::warn!("Multiple kernel stacks for timestamp {timestamp_raw} on thread {tid}"); kernel_stack.extend(&stack); } else { - pending_stack.kernel_stack = Some(stack); + sample_info.kernel_stack = Some(stack); } } } @@ -1192,104 +1195,118 @@ impl ProfileContext { timestamp_raw: u64, pid: u32, tid: u32, - stack: Vec, + user_stack: Vec, ) { - let Some(process) = self.processes.get_by_pid(pid) else { - return; - }; let Some(thread) = self.threads.get_by_tid(tid) else { return; }; - // We now know that we have a user stack. User stacks always come last. Consume - // the pending stack with matching timestamp. - let user_stack = stack; + // User stacks always come last. Consume any samples with pending stacks with matching timestamp. let user_stack_index = self .unresolved_stacks .convert(user_stack.iter().cloned().rev()); // the number of pending stacks at or before our timestamp - let num_pending_stacks = thread - .pending_stacks + let num_samples_with_pending_stacks = thread + .samples_with_pending_stacks .iter() .take_while(|s| s.timestamp <= timestamp_raw) .count(); - let pending_stacks: VecDeque<_> = - thread.pending_stacks.drain(..num_pending_stacks).collect(); + let samples_with_pending_stacks: VecDeque<_> = thread + .samples_with_pending_stacks + .drain(..num_samples_with_pending_stacks) + .collect(); + + let thread_handle = thread.handle; // Use this user stack for all pending stacks from this thread. - for pending_stack in pending_stacks { - let PendingStack { - timestamp: timestamp_raw, - kernel_stack, - off_cpu_sample_group, - on_cpu_sample_cpu_delta, - } = pending_stack; - let timestamp = self.timestamp_converter.convert_time(timestamp_raw); + for sample_info in samples_with_pending_stacks { + self.consume_sample( + pid, + sample_info, + &user_stack, + user_stack_index, + thread_handle, + ); + } + } - if let Some(off_cpu_sample_group) = off_cpu_sample_group { - let OffCpuSampleGroup { - begin_timestamp: begin_timestamp_raw, - end_timestamp: end_timestamp_raw, - sample_count, - } = off_cpu_sample_group; - - let cpu_delta_raw = { - self.context_switch_handler - .consume_cpu_delta(&mut thread.context_switch_data) - }; - let cpu_delta = - CpuDelta::from_nanos(cpu_delta_raw * self.timestamp_converter.raw_to_ns_factor); + fn consume_sample( + &mut self, + pid: u32, + sample_info: SampleWithPendingStack, + user_stack: &[StackFrame], + user_stack_index: UnresolvedStackHandle, + thread_handle: ThreadHandle, + ) { + let Some(process) = self.processes.get_by_pid(pid) else { + return; + }; + let SampleWithPendingStack { + timestamp: timestamp_raw, + kernel_stack, + off_cpu_sample_group, + mut cpu_delta, + has_on_cpu_sample, + } = sample_info; + let timestamp = self.timestamp_converter.convert_time(timestamp_raw); - // Add a sample at the beginning of the paused range. - // This "first sample" will carry any leftover accumulated running time ("cpu delta"). - let begin_timestamp = self.timestamp_converter.convert_time(begin_timestamp_raw); + if let Some(off_cpu_sample_group) = off_cpu_sample_group { + let OffCpuSampleGroup { + begin_timestamp: begin_timestamp_raw, + end_timestamp: end_timestamp_raw, + sample_count, + } = off_cpu_sample_group; + + // Add a sample at the beginning of the paused range. + // This "first sample" will carry any leftover accumulated running time ("cpu delta"). + let begin_timestamp = self.timestamp_converter.convert_time(begin_timestamp_raw); + process.unresolved_samples.add_sample( + thread_handle, + begin_timestamp, + begin_timestamp_raw, + user_stack_index, + cpu_delta, + 1, + None, + ); + cpu_delta = CpuDelta::ZERO; + + if sample_count > 1 { + // Emit a "rest sample" with a CPU delta of zero covering the rest of the paused range. + let weight = i32::try_from(sample_count - 1).unwrap_or(0); + let end_timestamp = self.timestamp_converter.convert_time(end_timestamp_raw); process.unresolved_samples.add_sample( - thread.handle, - begin_timestamp, - begin_timestamp_raw, + thread_handle, + end_timestamp, + end_timestamp_raw, user_stack_index, - cpu_delta, - 1, + CpuDelta::ZERO, + weight, None, ); - - if sample_count > 1 { - // Emit a "rest sample" with a CPU delta of zero covering the rest of the paused range. - let weight = i32::try_from(sample_count - 1).unwrap_or(0); - let end_timestamp = self.timestamp_converter.convert_time(end_timestamp_raw); - process.unresolved_samples.add_sample( - thread.handle, - end_timestamp, - end_timestamp_raw, - user_stack_index, - CpuDelta::ZERO, - weight, - None, - ); - } } + } - if let Some(cpu_delta) = on_cpu_sample_cpu_delta { - let stack_index = if let Some(mut combined_stack) = kernel_stack { - combined_stack.extend_from_slice(&user_stack[..]); - self.unresolved_stacks - .convert(combined_stack.into_iter().rev()) - } else { - user_stack_index - }; - process.unresolved_samples.add_sample( - thread.handle, - timestamp, - timestamp_raw, - stack_index, - cpu_delta, - 1, - None, - ); - self.stack_sample_count += 1; - } + if has_on_cpu_sample { + let stack_index = if let Some(mut combined_stack) = kernel_stack { + combined_stack.extend_from_slice(user_stack); + self.unresolved_stacks + .convert(combined_stack.into_iter().rev()) + } else { + user_stack_index + }; + process.unresolved_samples.add_sample( + thread_handle, + timestamp, + timestamp_raw, + stack_index, + cpu_delta, + 1, + None, + ); + self.stack_sample_count += 1; } } @@ -1305,12 +1322,15 @@ impl ProfileContext { .context_switch_handler .consume_cpu_delta(&mut thread.context_switch_data); let cpu_delta = CpuDelta::from_nanos(delta * self.timestamp_converter.raw_to_ns_factor); - thread.pending_stacks.push_back(PendingStack { - timestamp: timestamp_raw, - kernel_stack: None, - off_cpu_sample_group, - on_cpu_sample_cpu_delta: Some(cpu_delta), - }); + thread + .samples_with_pending_stacks + .push_back(SampleWithPendingStack { + timestamp: timestamp_raw, + kernel_stack: None, + off_cpu_sample_group, + cpu_delta, + has_on_cpu_sample: true, + }); self.sample_count += 1; } @@ -1504,13 +1524,21 @@ impl ProfileContext { let off_cpu_sample_group = self .context_switch_handler .handle_switch_in(timestamp_raw, &mut new_thread.context_switch_data); + let cpu_delta_raw = self + .context_switch_handler + .consume_cpu_delta(&mut new_thread.context_switch_data); + let cpu_delta = + CpuDelta::from_nanos(cpu_delta_raw * self.timestamp_converter.raw_to_ns_factor); if let Some(off_cpu_sample_group) = off_cpu_sample_group { - new_thread.pending_stacks.push_back(PendingStack { - timestamp: timestamp_raw, - kernel_stack: None, - off_cpu_sample_group: Some(off_cpu_sample_group), - on_cpu_sample_cpu_delta: None, - }); + new_thread + .samples_with_pending_stacks + .push_back(SampleWithPendingStack { + timestamp: timestamp_raw, + kernel_stack: None, + off_cpu_sample_group: Some(off_cpu_sample_group), + cpu_delta, + has_on_cpu_sample: false, + }); } } } From bcb65f40546d9e255a0116bebb42715922abe046 Mon Sep 17 00:00:00 2001 From: Markus Stange Date: Wed, 17 Jul 2024 19:48:22 -0400 Subject: [PATCH 4/4] Respect --per-cpu-threads on Windows. --- etw-reader/examples/dump.rs | 5 +- fxprof-processed-profile/src/profile.rs | 6 + fxprof-processed-profile/src/thread.rs | 4 + samply/src/linux_shared/converter.rs | 3 +- samply/src/shared/per_cpu.rs | 21 +- samply/src/shared/timestamp_converter.rs | 7 +- samply/src/shared/unresolved_samples.rs | 9 +- samply/src/windows/etw_gecko.rs | 10 +- samply/src/windows/profile_context.rs | 250 +++++++++++++++++++---- 9 files changed, 258 insertions(+), 57 deletions(-) diff --git a/etw-reader/examples/dump.rs b/etw-reader/examples/dump.rs index faa8aa24..cc843d56 100644 --- a/etw-reader/examples/dump.rs +++ b/etw-reader/examples/dump.rs @@ -35,14 +35,15 @@ fn main() { } } println!( - "{:?} {} {} {}-{} {} {}", + "{:?} {} {} {}-{} {} {}, processor {}", e.EventHeader.ProviderId, s.name(), s.provider_name(), e.EventHeader.EventDescriptor.Opcode, e.EventHeader.EventDescriptor.Id, s.property_count(), - e.EventHeader.TimeStamp + e.EventHeader.TimeStamp, + unsafe { e.BufferContext.Anonymous.ProcessorIndex }, ); println!( "pid: {} {:?}", diff --git a/fxprof-processed-profile/src/profile.rs b/fxprof-processed-profile/src/profile.rs index 42cad6c5..4d5988a1 100644 --- a/fxprof-processed-profile/src/profile.rs +++ b/fxprof-processed-profile/src/profile.rs @@ -393,6 +393,12 @@ impl Profile { self.threads[thread.0].set_end_time(end_time); } + /// Set the tid (thread ID) of a thread. + pub fn set_thread_tid(&mut self, thread: ThreadHandle, tid: u32) { + let tid = self.make_unique_tid(tid); + self.threads[thread.0].set_tid(tid); + } + /// Turn the string into in a [`StringHandle`], for use in [`Frame::Label`]. pub fn intern_string(&mut self, s: &str) -> StringHandle { StringHandle(self.string_table.index_for_string(s)) diff --git a/fxprof-processed-profile/src/thread.rs b/fxprof-processed-profile/src/thread.rs index fb0ed95e..8646134c 100644 --- a/fxprof-processed-profile/src/thread.rs +++ b/fxprof-processed-profile/src/thread.rs @@ -81,6 +81,10 @@ impl Thread { self.end_time = Some(end_time); } + pub fn set_tid(&mut self, tid: String) { + self.tid = tid; + } + pub fn process(&self) -> ProcessHandle { self.process } diff --git a/samply/src/linux_shared/converter.rs b/samply/src/linux_shared/converter.rs index 371431fd..651ce9f0 100644 --- a/samply/src/linux_shared/converter.rs +++ b/samply/src/linux_shared/converter.rs @@ -12,6 +12,7 @@ use fxprof_processed_profile::{ ReferenceTimestamp, SamplingInterval, StaticSchemaMarker, StringHandle, SymbolTable, ThreadHandle, }; +use linux_perf_data::linux_perf_event_reader::TaskWasPreempted; use linux_perf_data::simpleperf_dso_type::{DSO_DEX_FILE, DSO_KERNEL, DSO_KERNEL_MODULE}; use linux_perf_data::{ linux_perf_event_reader, DsoInfo, DsoKey, Endianness, SimpleperfFileRecord, SimpleperfSymbol, @@ -980,7 +981,7 @@ where &self.timestamp_converter, &[cpu.thread_handle, combined_thread], thread.profile_thread, - preempted, + preempted == TaskWasPreempted::Yes, &mut self.profile, ); } diff --git a/samply/src/shared/per_cpu.rs b/samply/src/shared/per_cpu.rs index 93ab3dba..5a052e1a 100644 --- a/samply/src/shared/per_cpu.rs +++ b/samply/src/shared/per_cpu.rs @@ -3,7 +3,6 @@ use fxprof_processed_profile::{ MarkerLocation, MarkerSchema, MarkerTiming, ProcessHandle, Profile, StaticSchemaMarker, StringHandle, ThreadHandle, Timestamp, }; -use linux_perf_data::linux_perf_event_reader::TaskWasPreempted; use crate::shared::context_switch::ThreadContextSwitchData; use crate::shared::timestamp_converter::TimestampConverter; @@ -46,7 +45,7 @@ impl Cpu { let previous_tid = std::mem::replace(&mut self.current_tid, Some((tid, thread_name, timestamp))); if let Some((_previous_tid, previous_thread_name, switch_in_timestamp)) = previous_tid { - // eprintln!("Missing switch-out (noticed during switch-in) on {}: {previous_tid}, {switch_in_timestamp}", self.name); + // eprintln!("Missing switch-out (noticed during switch-in) on {}: {previous_tid}, {switch_in_timestamp}", profile.get_string(self.name)); let start_timestamp = converter.convert_time(switch_in_timestamp); let end_timestamp = converter.convert_time(timestamp); let timing = MarkerTiming::Interval(start_timestamp, end_timestamp); @@ -63,12 +62,12 @@ impl Cpu { #[allow(clippy::too_many_arguments)] pub fn notify_switch_out( &mut self, - tid: i32, + tid: i32, // tid that is being switched away from timestamp: u64, converter: &TimestampConverter, - thread_handles: &[ThreadHandle], - thread_handle: ThreadHandle, - preempted: TaskWasPreempted, + thread_handles: &[ThreadHandle], // for cpu tracks + thread_handle: ThreadHandle, // for thread tracks + preempted: bool, profile: &mut Profile, ) { let previous_tid = self.current_tid.take(); @@ -84,8 +83,8 @@ impl Cpu { ); } let switch_out_reason = match preempted { - TaskWasPreempted::Yes => profile.intern_string("preempted"), - TaskWasPreempted::No => profile.intern_string("blocked"), + true => profile.intern_string("preempted"), + false => profile.intern_string("blocked"), }; profile.add_marker( thread_handle, @@ -96,16 +95,16 @@ impl Cpu { }, ); if previous_tid != tid { - // eprintln!("Missing switch-out (noticed during switch-out) on {}: {previous_tid}, {switch_in_timestamp}", self.name); + // eprintln!("Missing switch-out (noticed during switch-out) on {}: {previous_tid}, {switch_in_timestamp}", profile.get_string(self.name)); // eprintln!( // "Missing switch-in (noticed during switch-out) on {}: {tid}, {timestamp}", - // self.name + // profile.get_string(self.name) // ); } } else { // eprintln!( // "Missing switch-in (noticed during switch-out) on {}: {tid}, {timestamp}", - // self.name + // profile.get_string(self.name) // ); } } diff --git a/samply/src/shared/timestamp_converter.rs b/samply/src/shared/timestamp_converter.rs index f471526f..55452b7a 100644 --- a/samply/src/shared/timestamp_converter.rs +++ b/samply/src/shared/timestamp_converter.rs @@ -1,4 +1,4 @@ -use fxprof_processed_profile::Timestamp; +use fxprof_processed_profile::{CpuDelta, Timestamp}; #[derive(Debug, Clone, Copy)] pub struct TimestampConverter { @@ -15,6 +15,11 @@ impl TimestampConverter { ) } + #[allow(dead_code)] + pub fn convert_cpu_delta(&self, delta_raw: u64) -> CpuDelta { + CpuDelta::from_nanos(delta_raw * self.raw_to_ns_factor) + } + #[allow(unused)] pub fn convert_us(&self, time_us: u64) -> Timestamp { Timestamp::from_nanos_since_reference( diff --git a/samply/src/shared/unresolved_samples.rs b/samply/src/shared/unresolved_samples.rs index eb60cfb2..b8f49d1c 100644 --- a/samply/src/shared/unresolved_samples.rs +++ b/samply/src/shared/unresolved_samples.rs @@ -172,7 +172,14 @@ impl UnresolvedStacks { /// Get the `UnresolvedStackHandle` for a stack. The stack must be ordered from /// caller-most to callee-most ("outside to inside"). pub fn convert(&mut self, frames: impl Iterator) -> UnresolvedStackHandle { - let mut prefix = UnresolvedStackHandle::EMPTY; + self.convert_with_prefix(UnresolvedStackHandle::EMPTY, frames) + } + + pub fn convert_with_prefix( + &mut self, + mut prefix: UnresolvedStackHandle, + frames: impl Iterator, + ) -> UnresolvedStackHandle { for frame in frames { let x = (prefix, frame); let node = *self.stack_lookup.entry(x).or_insert_with(|| { diff --git a/samply/src/windows/etw_gecko.rs b/samply/src/windows/etw_gecko.rs index 10fb461d..7f41e9aa 100644 --- a/samply/src/windows/etw_gecko.rs +++ b/samply/src/windows/etw_gecko.rs @@ -207,7 +207,8 @@ fn process_trace( return; } let tid: u32 = parser.parse("ThreadId"); - context.handle_sample(timestamp_raw, tid); + let cpu = u32::from(unsafe { e.BufferContext.Anonymous.ProcessorIndex }); + context.handle_sample(timestamp_raw, tid, cpu); } "MSNT_SystemTrace/PageFault/DemandZeroFault" => { if !context.is_in_time_range(timestamp_raw) { @@ -218,7 +219,8 @@ fn process_trace( } let tid: u32 = s.thread_id(); - context.handle_sample(timestamp_raw, tid); + let cpu = u32::from(unsafe { e.BufferContext.Anonymous.ProcessorIndex }); + context.handle_sample(timestamp_raw, tid, cpu); } "MSNT_SystemTrace/PageFault/VirtualAlloc" | "MSNT_SystemTrace/PageFault/VirtualFree" => { @@ -316,7 +318,9 @@ fn process_trace( } let old_tid: u32 = parser.parse("OldThreadId"); let new_tid: u32 = parser.parse("NewThreadId"); - context.handle_cswitch(timestamp_raw, old_tid, new_tid); + let cpu = u32::from(unsafe { e.BufferContext.Anonymous.ProcessorIndex }); + let wait_reason: i8 = parser.parse("OldThreadWaitReason"); + context.handle_cswitch(timestamp_raw, old_tid, new_tid, cpu, wait_reason); } "MSNT_SystemTrace/Thread/ReadyThread" => { if !context.is_in_time_range(timestamp_raw) { diff --git a/samply/src/windows/profile_context.rs b/samply/src/windows/profile_context.rs index 56c31888..778a1df4 100644 --- a/samply/src/windows/profile_context.rs +++ b/samply/src/windows/profile_context.rs @@ -21,6 +21,7 @@ use crate::shared::jit_category_manager::{JitCategoryManager, JsFrame}; use crate::shared::jit_function_add_marker::JitFunctionAddMarker; use crate::shared::jit_function_recycler::JitFunctionRecycler; use crate::shared::lib_mappings::{LibMappingAdd, LibMappingInfo, LibMappingOp, LibMappingOpQueue}; +use crate::shared::per_cpu::Cpus; use crate::shared::process_name::make_process_name; use crate::shared::process_sample_data::{ProcessSampleData, UserTimingMarker}; use crate::shared::recording_props::ProfileCreationProps; @@ -46,6 +47,7 @@ pub struct SampleWithPendingStack { pub off_cpu_sample_group: Option, pub cpu_delta: CpuDelta, pub has_on_cpu_sample: bool, + pub per_cpu_stuff: Option<(ThreadHandle, CpuDelta)>, } #[derive(Debug)] @@ -172,6 +174,13 @@ impl Thread { process_id: pid, } } + + pub fn thread_label(&self) -> StringHandle { + match self.label_frame.frame { + Frame::Label(s) => s, + _ => panic!(), + } + } } pub struct Processes { @@ -503,6 +512,8 @@ pub struct ProfileContext { // Time range from the timestamp origin time_range: Option<(Timestamp, Timestamp)>, + + cpus: Option, } impl ProfileContext { @@ -552,6 +563,15 @@ impl ProfileContext { allow_jit_function_recycling, ); + let cpus = if profile_creation_props.create_per_cpu_threads { + Some(Cpus::new( + Timestamp::from_nanos_since_reference(0), + &mut profile, + )) + } else { + None + }; + Self { profile, profile_creation_props, @@ -584,6 +604,7 @@ impl ProfileContext { event_timestamps_are_qpc: false, main_thread_only, time_range, + cpus, } } @@ -870,12 +891,16 @@ impl ProfileContext { timestamp_raw: u64, tid: u32, pid: u32, - name: Option, + mut name: Option, ) { if !self.is_interesting_process(pid, None, None) { return; } + if name.as_deref().is_some_and(|name| name.is_empty()) { + name = None; + } + let timestamp = self.timestamp_converter.convert_time(timestamp_raw); let Some(process) = self.processes.get_by_pid(pid) else { @@ -885,10 +910,11 @@ impl ProfileContext { if !process.seen_main_thread_start { process.seen_main_thread_start = true; let thread_handle = process.main_thread_handle; + let thread_name = name.as_deref().unwrap_or(&process.name); let thread_label_frame = - make_thread_label_frame(&mut self.profile, name.as_deref(), pid, tid); + make_thread_label_frame(&mut self.profile, Some(thread_name), pid, tid); process.main_thread_label_frame = thread_label_frame.clone(); - // self.profile.set_thread_tid(thread_handle, tid); + self.profile.set_thread_tid(thread_handle, tid); let thread = Thread::new(name, true, thread_handle, thread_label_frame, pid, tid); self.threads.add(tid, timestamp_raw, thread); self.thread_handles @@ -940,10 +966,11 @@ impl ProfileContext { if !process.seen_main_thread_start { process.seen_main_thread_start = true; let thread_handle = process.main_thread_handle; + let thread_name = name.as_deref().unwrap_or(&process.name); let thread_label_frame = - make_thread_label_frame(&mut self.profile, name.as_deref(), pid, tid); + make_thread_label_frame(&mut self.profile, Some(thread_name), pid, tid); process.main_thread_label_frame = thread_label_frame.clone(); - // self.profile.set_thread_tid(thread_handle, tid); + self.profile.set_thread_tid(thread_handle, tid); let thread = Thread::new(name, true, thread_handle, thread_label_frame, pid, tid); self.threads.add(tid, timestamp_raw, thread); self.thread_handles @@ -1033,12 +1060,15 @@ impl ProfileContext { if let Some((thread_handle, thread_label_frame)) = thread_recycler.recycle_by_name(&name) { + thread.name = Some(name); thread.handle = thread_handle; thread.label_frame = thread_label_frame; self.thread_handles .insert((tid, timestamp_raw), thread_handle); + return; } } + thread.label_frame = make_thread_label_frame(&mut self.profile, Some(&name), pid, tid); self.profile.set_thread_name(thread.handle, &name); thread.name = Some(name); } @@ -1168,25 +1198,40 @@ impl ProfileContext { fn handle_kernel_stack( &mut self, timestamp_raw: u64, - _pid: u32, + pid: u32, tid: u32, stack: Vec, ) { let Some(thread) = self.threads.get_by_tid(tid) else { return; }; - if let Some(sample_info) = thread + let Some(index) = thread .samples_with_pending_stacks .iter_mut() - .rev() - .find(|s| s.timestamp == timestamp_raw) - { - if let Some(kernel_stack) = sample_info.kernel_stack.as_mut() { - log::warn!("Multiple kernel stacks for timestamp {timestamp_raw} on thread {tid}"); - kernel_stack.extend(&stack); - } else { - sample_info.kernel_stack = Some(stack); - } + .rposition(|s| s.timestamp == timestamp_raw) + else { + return; + }; + let sample_info = &mut thread.samples_with_pending_stacks[index]; + if let Some(kernel_stack) = sample_info.kernel_stack.as_mut() { + log::warn!("Multiple kernel stacks for timestamp {timestamp_raw} on thread {tid}"); + kernel_stack.extend(&stack); + } else { + sample_info.kernel_stack = Some(stack); + } + + if pid == 4 { + // No user stack will arrive. Consume the sample now. + let sample_info = thread.samples_with_pending_stacks.remove(index).unwrap(); + let thread_handle = thread.handle; + let thread_label_frame = thread.label_frame.clone(); + self.consume_sample( + pid, + sample_info, + UnresolvedStackHandle::EMPTY, + thread_handle, + thread_label_frame, + ); } } @@ -1202,9 +1247,7 @@ impl ProfileContext { }; // User stacks always come last. Consume any samples with pending stacks with matching timestamp. - let user_stack_index = self - .unresolved_stacks - .convert(user_stack.iter().cloned().rev()); + let user_stack_index = self.unresolved_stacks.convert(user_stack.into_iter().rev()); // the number of pending stacks at or before our timestamp let num_samples_with_pending_stacks = thread @@ -1219,15 +1262,16 @@ impl ProfileContext { .collect(); let thread_handle = thread.handle; + let thread_label_frame = thread.label_frame.clone(); // Use this user stack for all pending stacks from this thread. for sample_info in samples_with_pending_stacks { self.consume_sample( pid, sample_info, - &user_stack, user_stack_index, thread_handle, + thread_label_frame.clone(), ); } } @@ -1236,9 +1280,9 @@ impl ProfileContext { &mut self, pid: u32, sample_info: SampleWithPendingStack, - user_stack: &[StackFrame], user_stack_index: UnresolvedStackHandle, thread_handle: ThreadHandle, + thread_label_frame: FrameInfo, ) { let Some(process) = self.processes.get_by_pid(pid) else { return; @@ -1249,6 +1293,7 @@ impl ProfileContext { off_cpu_sample_group, mut cpu_delta, has_on_cpu_sample, + per_cpu_stuff, } = sample_info; let timestamp = self.timestamp_converter.convert_time(timestamp_raw); @@ -1289,28 +1334,51 @@ impl ProfileContext { } } - if has_on_cpu_sample { - let stack_index = if let Some(mut combined_stack) = kernel_stack { - combined_stack.extend_from_slice(user_stack); - self.unresolved_stacks - .convert(combined_stack.into_iter().rev()) - } else { - user_stack_index - }; + if !has_on_cpu_sample { + return; + } + + let stack_index = if let Some(kernel_stack) = kernel_stack { + self.unresolved_stacks + .convert_with_prefix(user_stack_index, kernel_stack.into_iter().rev()) + } else { + user_stack_index + }; + process.unresolved_samples.add_sample( + thread_handle, + timestamp, + timestamp_raw, + stack_index, + cpu_delta, + 1, + None, + ); + + if let Some((cpu_thread_handle, cpu_delta)) = per_cpu_stuff { process.unresolved_samples.add_sample( - thread_handle, + cpu_thread_handle, timestamp, timestamp_raw, stack_index, cpu_delta, 1, - None, + Some(thread_label_frame.clone()), + ); + process.unresolved_samples.add_sample( + self.cpus.as_ref().unwrap().combined_thread_handle(), + timestamp, + timestamp_raw, + stack_index, + CpuDelta::ZERO, + 1, + Some(thread_label_frame.clone()), ); - self.stack_sample_count += 1; } + + self.stack_sample_count += 1; } - pub fn handle_sample(&mut self, timestamp_raw: u64, tid: u32) { + pub fn handle_sample(&mut self, timestamp_raw: u64, tid: u32, cpu_index: u32) { let Some(thread) = self.threads.get_by_tid(tid) else { return; }; @@ -1321,7 +1389,31 @@ impl ProfileContext { let delta = self .context_switch_handler .consume_cpu_delta(&mut thread.context_switch_data); - let cpu_delta = CpuDelta::from_nanos(delta * self.timestamp_converter.raw_to_ns_factor); + let cpu_delta = self.timestamp_converter.convert_cpu_delta(delta); + + let per_cpu_stuff = if let Some(cpus) = &mut self.cpus { + let cpu = cpus.get_mut(cpu_index as usize, &mut self.profile); + + let cpu_thread_handle = cpu.thread_handle; + + // Consume idle cpu time. + let _idle_cpu_sample = self + .context_switch_handler + .handle_on_cpu_sample(timestamp_raw, &mut cpu.context_switch_data); + + let cpu_delta = if true { + self.timestamp_converter.convert_cpu_delta( + self.context_switch_handler + .consume_cpu_delta(&mut cpu.context_switch_data), + ) + } else { + CpuDelta::from_nanos(0) + }; + Some((cpu_thread_handle, cpu_delta)) + } else { + None + }; + thread .samples_with_pending_stacks .push_back(SampleWithPendingStack { @@ -1330,6 +1422,7 @@ impl ProfileContext { off_cpu_sample_group, cpu_delta, has_on_cpu_sample: true, + per_cpu_stuff, }); self.sample_count += 1; @@ -1513,13 +1606,45 @@ impl ProfileContext { .add_marker(*gpu_thread, MarkerTiming::Instant(timestamp), VSyncMarker); } - pub fn handle_cswitch(&mut self, timestamp_raw: u64, old_tid: u32, new_tid: u32) { - // println!("CSwitch {} -> {} @ {} on {}", old_tid, old_tid, e.EventHeader.TimeStamp, unsafe { e.BufferContext.Anonymous.ProcessorIndex }); + pub fn handle_cswitch( + &mut self, + timestamp_raw: u64, + old_tid: u32, + new_tid: u32, + cpu_index: u32, + wait_reason: i8, + ) { + // CSwitch events may or may not have stacks. + // If they have stacks, the stack will be the stack of new_tid. + // In other words, if a thread sleeps, the sleeping stack is delivered to us at the end of the sleep, + // once the CPU starts executing the switched-to thread. + // (That's different to e.g. Linux with sched_switch samples, which deliver the stack at the start of the sleep, i.e. just before the switch-out.) if let Some(old_thread) = self.threads.get_by_tid(old_tid) { self.context_switch_handler .handle_switch_out(timestamp_raw, &mut old_thread.context_switch_data); + + if let Some(cpus) = &mut self.cpus { + let combined_thread = cpus.combined_thread_handle(); + let cpu = cpus.get_mut(cpu_index as usize, &mut self.profile); + self.context_switch_handler + .handle_switch_out(timestamp_raw, &mut cpu.context_switch_data); + + // TODO: Find out if this actually is the right way to check whether a thread + // has been pre-empted. + let preempted = wait_reason == 0 || wait_reason == 32; // "Executive" | "WrPreempted" + cpu.notify_switch_out( + old_tid as i32, + timestamp_raw, + &self.timestamp_converter, + &[cpu.thread_handle, combined_thread], + old_thread.handle, + preempted, + &mut self.profile, + ); + } } + if let Some(new_thread) = self.threads.get_by_tid(new_tid) { let off_cpu_sample_group = self .context_switch_handler @@ -1527,8 +1652,7 @@ impl ProfileContext { let cpu_delta_raw = self .context_switch_handler .consume_cpu_delta(&mut new_thread.context_switch_data); - let cpu_delta = - CpuDelta::from_nanos(cpu_delta_raw * self.timestamp_converter.raw_to_ns_factor); + let cpu_delta = self.timestamp_converter.convert_cpu_delta(cpu_delta_raw); if let Some(off_cpu_sample_group) = off_cpu_sample_group { new_thread .samples_with_pending_stacks @@ -1538,8 +1662,58 @@ impl ProfileContext { off_cpu_sample_group: Some(off_cpu_sample_group), cpu_delta, has_on_cpu_sample: false, + per_cpu_stuff: None, }); } + if let Some(cpus) = &mut self.cpus { + let combined_thread = cpus.combined_thread_handle(); + let idle_frame_label = cpus.idle_frame_label(); + let cpu = cpus.get_mut(cpu_index as usize, &mut self.profile); + + if let Some(idle_cpu_sample) = self + .context_switch_handler + .handle_switch_in(timestamp_raw, &mut cpu.context_switch_data) + { + // Add two samples with a stack saying "", with zero weight. + // This will correctly break up the stack chart to show that nothing was running in the idle time. + // This first sample will carry any leftover accumulated running time ("cpu delta"), + // and the second sample is placed at the end of the paused time. + let cpu_delta_raw = self + .context_switch_handler + .consume_cpu_delta(&mut cpu.context_switch_data); + let cpu_delta = self.timestamp_converter.convert_cpu_delta(cpu_delta_raw); + let begin_timestamp = self + .timestamp_converter + .convert_time(idle_cpu_sample.begin_timestamp); + self.profile.add_sample( + cpu.thread_handle, + begin_timestamp, + std::iter::once(idle_frame_label.clone()), + cpu_delta, + 0, + ); + + // Emit a "rest sample" with a CPU delta of zero covering the rest of the paused range. + let end_timestamp = self + .timestamp_converter + .convert_time(idle_cpu_sample.end_timestamp); + self.profile.add_sample( + cpu.thread_handle, + end_timestamp, + std::iter::once(idle_frame_label.clone()), + CpuDelta::from_nanos(0), + 0, + ); + } + cpu.notify_switch_in( + new_tid as i32, + new_thread.thread_label(), + timestamp_raw, + &self.timestamp_converter, + &[cpu.thread_handle, combined_thread], + &mut self.profile, + ); + } } }