Skip to content

Commit

Permalink
fix for darcs sometimes deadlocking when dropping with a small number of worker threads
Browse files Browse the repository at this point in the history
  • Loading branch information
rdfriese committed Oct 24, 2023
1 parent d3a36bf commit c4a186e
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 12 deletions.
2 changes: 1 addition & 1 deletion src/array/unsafe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -782,7 +782,7 @@ impl<T: Dist> LamellarArray<T> for UnsafeArray<T> {
{
// std::thread::yield_now();
self.inner.data.team.scheduler.exec_task(); // might as well do useful work while we wait
if temp_now.elapsed() > Duration::new(600, 0) {
if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT {
//|| first{
println!(
"in array wait_all mype: {:?} cnt: {:?} {:?} {:?}",
Expand Down
31 changes: 20 additions & 11 deletions src/darc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,7 @@ impl<T> DarcInner<T> {
fn block_on_outstanding(&self, state: DarcMode, extra_cnt: usize) {
self.wait_all();
let mut timer = std::time::Instant::now();
let team = self.team();
while self.dist_cnt.load(Ordering::SeqCst) > 0
|| self.local_cnt.load(Ordering::SeqCst) > 1 + extra_cnt
|| unsafe { self.any_ref_cnt() }
Expand All @@ -355,9 +356,10 @@ impl<T> DarcInner<T> {
self.send_finished();
}
if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT {
let ref_cnts_slice = unsafe { std::slice::from_raw_parts_mut(self.ref_cnt_addr as *mut usize, self.num_pes) };
println!("[WARNING] - Potential deadlock detected.\n\
The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\
This objected is likely a {:?} with {:?} remaining local references and {:?} remaining remote references\n\
This objected is likely a {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\
An example where this can occur can be found at https://docs.rs/lamellar/latest/lamellar/array/struct.ReadOnlyArray.html#method.into_local_lock\n\
The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\
To view backtrace set RUST_LIB_BACKTRACE=1\n\
Expand All @@ -382,9 +384,8 @@ impl<T> DarcInner<T> {
// );
timer = std::time::Instant::now();
}
std::thread::yield_now();
team.scheduler.exec_task();
}
let team = self.team();
let mode_refs =
unsafe { std::slice::from_raw_parts_mut(self.mode_addr as *mut u8, self.num_pes) };
unsafe {
Expand All @@ -406,9 +407,10 @@ impl<T> DarcInner<T> {
self.send_finished();
}
if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT {
let ref_cnts_slice = unsafe { std::slice::from_raw_parts_mut(self.ref_cnt_addr as *mut usize, self.num_pes) };
println!("[WARNING] -- Potential deadlock detected.\n\
The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\
This objected is likely a {:?} with {:?} remaining local references and {:?} remaining remote references\n\
This objected is likely a {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\
An example where this can occur can be found at https://docs.rs/lamellar/latest/lamellar/array/struct.ReadOnlyArray.html#method.into_local_lock\n\
The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\
To view backtrace set RUST_LIB_BACKTRACE=1\n\
Expand All @@ -423,7 +425,7 @@ impl<T> DarcInner<T> {
);
timer = std::time::Instant::now();
}
std::thread::yield_now();
team.scheduler.exec_task();
}
}
while self.dist_cnt.load(Ordering::SeqCst) != 0
Expand All @@ -434,9 +436,10 @@ impl<T> DarcInner<T> {
self.send_finished();
}
if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT {
let ref_cnts_slice = unsafe { std::slice::from_raw_parts_mut(self.ref_cnt_addr as *mut usize, self.num_pes) };
println!("[WARNING] --- Potential deadlock detected.\n\
The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\
This objected is likely a {:?} with {:?} remaining local references and {:?} remaining remote references\n\
This objected is likely a {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\
An example where this can occur can be found at https://docs.rs/lamellar/latest/lamellar/array/struct.ReadOnlyArray.html#method.into_local_lock\n\
The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\
To view backtrace set RUST_LIB_BACKTRACE=1\n\
Expand All @@ -451,7 +454,7 @@ impl<T> DarcInner<T> {
);
timer = std::time::Instant::now();
}
std::thread::yield_now();
team.scheduler.exec_task();
}
// println!("{:?}",self);
self.team().barrier();
Expand All @@ -465,7 +468,7 @@ impl<T> DarcInner<T> {
while am_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 {
// std::thread::yield_now();
team.scheduler.exec_task(); // might as well do useful work while we wait
if temp_now.elapsed() > Duration::new(600, 0) {
if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT {
//|| first{
println!(
"in darc wait_all mype: {:?} cnt: {:?} {:?}",
Expand Down Expand Up @@ -915,9 +918,11 @@ impl<T: 'static> LamellarAM for DroppedWaitAM<T> {
wrapped.inner.as_ref().send_finished();
}
if timeout.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT {
let ref_cnts_slice = unsafe { std::slice::from_raw_parts_mut(wrapped.inner.as_ref().ref_cnt_addr as *mut usize, wrapped.inner.as_ref().num_pes) };

println!("[WARNING] - Potential deadlock detected when trying to free distributed object.\n\
The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\
The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references\n\
The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\
the deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\
To view backtrace set RUST_LIB_BACKTRACE=1\n\
{}",
Expand Down Expand Up @@ -952,9 +957,11 @@ impl<T: 'static> LamellarAM for DroppedWaitAM<T> {
}
}
if timeout.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT {
let ref_cnts_slice = unsafe { std::slice::from_raw_parts_mut(wrapped.inner.as_ref().ref_cnt_addr as *mut usize, wrapped.inner.as_ref().num_pes) };

println!("[WARNING] -- Potential deadlock detected when trying to free distributed object.\n\
The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\
The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references\n\
The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\
the deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\
To view backtrace set RUST_LIB_BACKTRACE=1\n\
{}",
Expand Down Expand Up @@ -984,9 +991,11 @@ impl<T: 'static> LamellarAM for DroppedWaitAM<T> {
wrapped.inner.as_ref().send_finished();
}
if timeout.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT {
let ref_cnts_slice = unsafe { std::slice::from_raw_parts_mut(wrapped.inner.as_ref().ref_cnt_addr as *mut usize, wrapped.inner.as_ref().num_pes) };

println!("[WARNING] --- Potential deadlock detected when trying to free distributed object.\n\
The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\
The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references\n\
The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\
the deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\
To view backtrace set RUST_LIB_BACKTRACE=1\n\
{}",
Expand Down

0 comments on commit c4a186e

Please sign in to comment.