You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Attaching cuda-gdb to the MAIN rank's PID to determine the active kernel (the other ranks have no active kernels):
(cuda-gdb) info stack
#0 0x00007f64a9c1f480 in ncclDevFunc_AllGather_RING_SIMPLE() ()
#1 0x00007f601be4e600 in ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<4096ul>)<<<(24,1,1),(544,1,1)>>> ()
(cuda-gdb) info cuda kernels
Kernel Parent Dev Grid Status SMs Mask GridDim BlockDim Invocation
* 0 - 0 5059392810 Active 0x000000000000000000000000000fff0fff (24,1,1) (544,1,1) ncclDevKernel_AllGather_RING_LL()
some meaningful backtraces
Thread 16 (Thread 0x7f5ff8ffd000 (LWP 2791236) "executor_server"):
#0 0x00007f6b84ad47aa in pthread_cond_timedwait@@GLIBC_2.3.2 () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6bf41a13ee in tensorrt_llm::executor::Executor::Impl::awaitResponses(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > > const&) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#2 0x00007f6bf4193ead in tensorrt_llm::executor::Executor::awaitResponses(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > > const&) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#3 0x000000000045d85b in modules::executor_server::Executor::ExecutorImpl::awaitRoutine (this=0x24c3180, interruptToken=...) at /sources/contrib/tensorrt-llm/modules/executor_server/src/serverImpl.cpp:330
#4 0x00000000004639c0 in operator() (__closure=<optimized out>, __closure=<optimized out>, stopToken=...) at /sources/contrib/tensorrt-llm/modules/executor_server/src/serverImpl.cpp:252
#5 __invoke_impl<void, modules::executor_server::Executor::ExecutorImpl::ExecutorImpl(modules::executor_server::MainRank, const std::filesystem::__cxx11::path&, const modules::executor_server::ExecutorConfig&, const modules::executor_server::LogitsProcessorStaticConfig&)::<lambda(std::stop_token)>, std::stop_token> (__f=...) at /opt/rh/gcc-toolset-10/root/usr/include/c++/10/bits/invoke.h:60
#6 __invoke<modules::executor_server::Executor::ExecutorImpl::ExecutorImpl(modules::executor_server::MainRank, const std::filesystem::__cxx11::path&, const modules::executor_server::ExecutorConfig&, const modules::executor_server::LogitsProcessorStaticConfig&)::<lambda(std::stop_token)>, std::stop_token> (__fn=...) at /opt/rh/gcc-toolset-10/root/usr/include/c++/10/bits/invoke.h:95
#7 _M_invoke<0, 1> (this=<optimized out>) at /opt/rh/gcc-toolset-10/root/usr/include/c++/10/thread:264
#8 operator() (this=<optimized out>) at /opt/rh/gcc-toolset-10/root/usr/include/c++/10/thread:271
#9 _M_run (this=<optimized out>) at /opt/rh/gcc-toolset-10/root/usr/include/c++/10/thread:215
#10 0x00007f6bcdb2ca80 in execute_native_thread_routine () from /home/askhoroshev/wmcore/lib/libtensorrt_llm_nvrtc_wrapper.so
#11 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#12 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 15 (Thread 0x7f5ff97fe000 (LWP 2791234) "executionLoop"):
#0 0x00007f6b86894e88 in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#1 0x00007f6b86631833 in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#2 0x00007f6b8699fb3f in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#3 0x00007f6b8699fed5 in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#4 0x00007f6b866382cc in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#5 0x00007f6b8670117a in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#6 0x00007f6b86970459 in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#7 0x00007f6b867a58fd in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#8 0x000000000049f575 in libcudart_static_141dba5462e92d2cffd1abc474df476c510a3a8c ()
#9 0x0000000000504248 in cudaStreamSynchronize ()
#10 0x00000000004606c7 in tensorrt_llm::runtime::CudaStream::synchronize (this=<optimized out>) at /sources/contrib/tensorrt-llm/cpp/include/tensorrt_llm/runtime/cudaStream.h:84
#11 modules::executor_server::FusedLogitsProcessor::process (bufferManager=..., stream=..., beamTokens=..., tensorPtrs=<synthetic pointer>..., logitsRequestStates=..., this=0x24c31c0) at /sources/contrib/tensorrt-llm/modules/executor_server/src/logitsProcessor.cpp:658
#12 modules::executor_server::Executor::ExecutorImpl::LogitsProcessorBatchedAdaptor::operator() (this=0x13cac3b0, internalIds=..., logitTensors=..., beamTokens=..., stream=..., userIds=...) at /sources/contrib/tensorrt-llm/modules/executor_server/src/serverImpl.cpp:238
#13 0x00007f6bf419b315 in std::_Function_handler<void (std::vector<unsigned long, std::allocator<unsigned long> > const&, std::vector<std::shared_ptr<tensorrt_llm::runtime::ITensor>, std::allocator<std::shared_ptr<tensorrt_llm::runtime::ITensor> > >&, std::vector<std::reference_wrapper<std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > > const>, std::allocator<std::reference_wrapper<std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > > const> > > const&, std::shared_ptr<tensorrt_llm::runtime::CudaStream> const&, std::vector<std::optional<unsigned long>, std::allocator<std::optional<unsigned long> > > const&), tensorrt_llm::executor::Executor::Impl::initializeLogitsPostProcessorBatched(tensorrt_llm::executor::LogitsPostProcessorConfig const&)::{lambda(std::vector<unsigned long, std::allocator<unsigned long> > const&, std::vector<std::shared_ptr<tensorrt_llm::runtime::ITensor>, std::allocator<std::shared_ptr<tensorrt_llm::runtime::ITensor> > >&, std::vector<std::reference_wrapper<std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > > const>, std::allocator<std::reference_wrapper<std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > > const> > > const&, std::shared_ptr<tensorrt_llm::runtime::CudaStream> const&, std::vector<std::optional<unsigned long>, std::allocator<std::optional<unsigned long> > > const&)#1}>::_M_invoke(std::_Any_data const&, std::vector<unsigned long, std::allocator<unsigned long> > const&, std::vector<std::shared_ptr<tensorrt_llm::runtime::ITensor>, std::allocator<std::shared_ptr<tensorrt_llm::runtime::ITensor> > >&, std::vector<std::reference_wrapper<std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > > const>, std::allocator<std::reference_wrapper<std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > > const> > > const&, std::shared_ptr<tensorrt_llm::runtime::CudaStream> const&, std::vector<std::optional<unsigned long>, std::allocator<std::optional<unsigned long> > > const&) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#14 0x00007f6bf41793c4 in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::decoderStepAsync(tensorrt_llm::batch_manager::ScheduledRequests const&) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#15 0x00007f6bf417cde5 in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#16 0x00007f6bf41a6a71 in tensorrt_llm::executor::Executor::Impl::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#17 0x00007f6bf41ab97f in tensorrt_llm::executor::Executor::Impl::executionLoop() () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#18 0x00007f6bcdb2ca80 in execute_native_thread_routine () from /home/askhoroshev/wmcore/lib/libtensorrt_llm_nvrtc_wrapper.so
#19 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#20 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 14 (Thread 0x7f5ff9fff000 (LWP 2791231) "dataTransResp"):
#0 0x00007f6b84ad445c in pthread_cond_wait@@GLIBC_2.3.2 () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6b847ed870 in std::condition_variable::wait(std::unique_lock<std::mutex>&) () from /home/askhoroshev/wmcore/lib/libstdc++.so.6
#2 0x00007f6bf4110d33 in tensorrt_llm::batch_manager::DataResponder::Impl::response() () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#3 0x00007f6bf410ecdd in std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<void>, std::__future_base::_Result_base::_Deleter>, std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(), tensorrt_llm::batch_manager::DataResponder::Impl*> >, void> >::_M_invoke(std::_Any_data const&) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#4 0x00007f6bf410f2fb in std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#5 0x00007f6b84ad5e67 in __pthread_once_slow () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#6 0x00007f6bf410facd in std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::__future_base::_Async_state_impl<std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(), tensorrt_llm::batch_manager::DataResponder::Impl*> >, void>::_Async_state_impl(std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(), tensorrt_llm::batch_manager::DataResponder::Impl*> >&&)::{lambda()#1}> > >::_M_run() () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#7 0x00007f6bcdb2ca80 in execute_native_thread_routine () from /home/askhoroshev/wmcore/lib/libtensorrt_llm_nvrtc_wrapper.so
#8 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#9 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 13 (Thread 0x7f601d51d000 (LWP 2791219) "executor_server"):
#0 0x00007f6b83ef5f41 in poll () from /home/askhoroshev/wmcore/lib/libc.so.6
#1 0x00007f6b880ed3ea in ncclProxyServiceUDS(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#2 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#3 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 12 (Thread 0x7f694cee8000 (LWP 2791218) "executor_server"):
#0 0x00007f6b83ef5f41 in poll () from /home/askhoroshev/wmcore/lib/libc.so.6
#1 0x00007f6b880eed32 in ncclProxyService(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#2 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#3 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 11 (Thread 0x7f602cffd000 (LWP 2791205) "executor_server"):
#0 0x00007f6b84ad7ab4 in read () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6b72cabfa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f6b8811f042 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#3 0x00007f6b88141b54 in ncclIbAsyncThreadMain(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#4 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#5 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 10 (Thread 0x7f602d7fe000 (LWP 2791202) "executor_server"):
#0 0x00007f6b84ad7ab4 in read () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6b72cabfa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f6b8811f042 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#3 0x00007f6b88141b54 in ncclIbAsyncThreadMain(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#4 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#5 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 9 (Thread 0x7f602dfff000 (LWP 2791199) "executor_server"):
#0 0x00007f6b84ad7ab4 in read () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6b72cabfa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f6b8811f042 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#3 0x00007f6b88141b54 in ncclIbAsyncThreadMain(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#4 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#5 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 8 (Thread 0x7f64aaedb000 (LWP 2791197) "executor_server"):
#0 0x00007f6b84ad7ab4 in read () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6b72cabfa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f6b8811f042 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#3 0x00007f6b88141b54 in ncclIbAsyncThreadMain(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#4 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#5 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 7 (Thread 0x7f64ab6dc000 (LWP 2791195) "executor_server"):
#0 0x00007f6b84ad7ab4 in read () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6b72cabfa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f6b8811f042 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#3 0x00007f6b88141b54 in ncclIbAsyncThreadMain(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#4 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#5 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 6 (Thread 0x7f64abedd000 (LWP 2791192) "executor_server"):
#0 0x00007f6b84ad7ab4 in read () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6b72cabfa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f6b8811f042 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#3 0x00007f6b88141b54 in ncclIbAsyncThreadMain(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#4 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#5 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 5 (Thread 0x7f694efde000 (LWP 2790932) "cuda-EvtHandlr"):
#0 0x00007f6b84ad445c in pthread_cond_wait@@GLIBC_2.3.2 () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f52129f9d6a in ?? () from /lib64/libcudadebugger.so.1
#2 0x00007f52129f7e60 in ?? () from /lib64/libcudadebugger.so.1
#3 0x00007f52125cece5 in ?? () from /lib64/libcudadebugger.so.1
#4 0x00007f5212623601 in ?? () from /lib64/libcudadebugger.so.1
#5 0x00007f52125b4e2c in ?? () from /lib64/libcudadebugger.so.1
#6 0x00007f5212711526 in ?? () from /lib64/libcudadebugger.so.1
#7 0x00007f6b8696cffb in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#8 0x00007f6b867ee6a4 in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#9 0x00007f6b86721ee3 in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#10 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#11 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 4 (Thread 0x7f6b6eb7f000 (LWP 2790911) "cuda00006000019"):
#0 0x00007f6b83ef5f41 in poll () from /home/askhoroshev/wmcore/lib/libc.so.6
#1 0x00007f6b8672a1ef in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#2 0x00007f6b867ee64f in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#3 0x00007f6b86721ee3 in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#4 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#5 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
My version
Setup: a custom DeepSeek-like model in FP8 with tensor parallelism 4 (TP4), served through the Executor API (started with `mpirun -n 4`).
nccl version: 2.22.3
I use a batched logits processor to control generation:
Main rank batch logit processor config (does logit modification)
Other ranks config (actually do nothing)
When running the stability test everything works fine for the first 20 hours, but after that time (or about 2 million requests) the server hangs.
Attaching cuda-gdb to the MAIN rank's PID to determine the active kernel (the other ranks have no active kernels):
some meaningful backtraces
Is this related to NVIDIA/nccl#311?
The text was updated successfully, but these errors were encountered: