From c4a186eab09a7eac24bcacd30ee0ebe0df964805 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Tue, 24 Oct 2023 14:36:42 -0700 Subject: [PATCH] fix for darcs sometimes deadlocking when dropping with a small number of worker threads --- src/array/unsafe.rs | 2 +- src/darc.rs | 31 ++++++++++++++++++++----------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 4047e6d1..57d936ce 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -782,7 +782,7 @@ impl LamellarArray for UnsafeArray { { // std::thread::yield_now(); self.inner.data.team.scheduler.exec_task(); //mmight as well do useful work while we wait - if temp_now.elapsed() > Duration::new(600, 0) { + if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { //|| first{ println!( "in array wait_all mype: {:?} cnt: {:?} {:?} {:?}", diff --git a/src/darc.rs b/src/darc.rs index fceef654..46ab891c 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -347,6 +347,7 @@ impl DarcInner { fn block_on_outstanding(&self, state: DarcMode, extra_cnt: usize) { self.wait_all(); let mut timer = std::time::Instant::now(); + let team = self.team(); while self.dist_cnt.load(Ordering::SeqCst) > 0 || self.local_cnt.load(Ordering::SeqCst) > 1 + extra_cnt || unsafe { self.any_ref_cnt() } @@ -355,9 +356,10 @@ impl DarcInner { self.send_finished(); } if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + let ref_cnts_slice = unsafe { std::slice::from_raw_parts_mut(self.ref_cnt_addr as *mut usize, self.num_pes) }; println!("[WARNING] - Potential deadlock detected.\n\ The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ - This objected is likely a {:?} with {:?} remaining local references and {:?} remaining remote references\n\ + This objected is likely a {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ An example where this can occur can be found at https://docs.rs/lamellar/latest/lamellar/array/struct.ReadOnlyArray.html#method.into_local_lock\n\ The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ @@ -382,9 +384,8 @@ impl DarcInner { // ); timer = std::time::Instant::now(); } - std::thread::yield_now(); + team.scheduler.exec_task(); } - let team = self.team(); let mode_refs = unsafe { std::slice::from_raw_parts_mut(self.mode_addr as *mut u8, self.num_pes) }; unsafe { @@ -406,9 +407,10 @@ impl DarcInner { self.send_finished(); } if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + let ref_cnts_slice = unsafe { std::slice::from_raw_parts_mut(self.ref_cnt_addr as *mut usize, self.num_pes) }; println!("[WARNING] -- Potential deadlock detected.\n\ The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ - This objected is likely a {:?} with {:?} remaining local references and {:?} remaining remote references\n\ + This objected is likely a {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ An example where this can occur can be found at https://docs.rs/lamellar/latest/lamellar/array/struct.ReadOnlyArray.html#method.into_local_lock\n\ The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ @@ -423,7 +425,7 @@ impl DarcInner { ); timer = std::time::Instant::now(); } - std::thread::yield_now(); + team.scheduler.exec_task(); } } while self.dist_cnt.load(Ordering::SeqCst) != 0 @@ -434,9 +436,10 @@ impl DarcInner { self.send_finished(); } if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + let ref_cnts_slice = unsafe { std::slice::from_raw_parts_mut(self.ref_cnt_addr as *mut usize, self.num_pes) }; println!("[WARNING] --- Potential deadlock detected.\n\ The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ - This objected is likely a {:?} with {:?} remaining local references and {:?} remaining remote references\n\ + This objected is likely a {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ An example where this can occur can be found at https://docs.rs/lamellar/latest/lamellar/array/struct.ReadOnlyArray.html#method.into_local_lock\n\ The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ @@ -451,7 +454,7 @@ impl DarcInner { ); timer = std::time::Instant::now(); } - std::thread::yield_now(); + team.scheduler.exec_task(); } // println!("{:?}",self); self.team().barrier(); @@ -465,7 +468,7 @@ impl DarcInner { while am_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 { // std::thread::yield_now(); team.scheduler.exec_task(); //mmight as well do useful work while we wait - if temp_now.elapsed() > Duration::new(600, 0) { + if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { //|| first{ println!( "in darc wait_all mype: {:?} cnt: {:?} {:?}", @@ -915,9 +918,11 @@ impl LamellarAM for DroppedWaitAM { wrapped.inner.as_ref().send_finished(); } if timeout.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + let ref_cnts_slice = unsafe { std::slice::from_raw_parts_mut(wrapped.inner.as_ref().ref_cnt_addr as *mut usize, wrapped.inner.as_ref().num_pes) }; + println!("[WARNING] - Potential deadlock detected when trying to free distributed object.\n\ The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ - The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references\n\ + The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ the deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ {}", @@ -952,9 +957,11 @@ impl LamellarAM for DroppedWaitAM { } } if timeout.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + let ref_cnts_slice = unsafe { std::slice::from_raw_parts_mut(wrapped.inner.as_ref().ref_cnt_addr as *mut usize, wrapped.inner.as_ref().num_pes) }; + println!("[WARNING] -- Potential deadlock detected when trying to free distributed object.\n\ The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ - The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references\n\ + The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ the deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ {}", @@ -984,9 +991,11 @@ impl LamellarAM for DroppedWaitAM { wrapped.inner.as_ref().send_finished(); } if timeout.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + let ref_cnts_slice = unsafe { std::slice::from_raw_parts_mut(wrapped.inner.as_ref().ref_cnt_addr as *mut usize, wrapped.inner.as_ref().num_pes) }; + println!("[WARNING] --- Potential deadlock detected when trying to free distributed object.\n\ The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ - The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references\n\ + The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ the deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ {}",