From 6fbb2f70f64c0298153ebc11d89d25abf97c3b85 Mon Sep 17 00:00:00 2001 From: Xiaodong Wang Date: Mon, 11 Feb 2019 12:27:12 -0800 Subject: [PATCH] Catch cudaError_t return val (nodiscard in rocm) (#16399) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/16399 Catching cudaError_t return values in a few places, because it's nodiscard in rocm. Unless we add -Wno-unused-result, it'll end up with a compilation error. Also in c10/cuda/test, check whether a host has GPU or not. We were silently throwing out the error before (so not really testing the cuda api). Reviewed By: bddppq Differential Revision: D13828281 fbshipit-source-id: 587d1cc31c20b836ce9594e3c18f067d322b2934 --- c10/cuda/CUDACachingAllocator.cpp | 2 +- c10/cuda/impl/CUDAGuardImpl.h | 5 ++++- c10/cuda/impl/CUDATest.cpp | 14 ++++++++++++-- caffe2/core/context_gpu.h | 4 ++-- 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 11047b45b43094..884700beeac4bd 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -479,7 +479,7 @@ struct THCCachingAllocator cuda_events.emplace_back(event, block); } - cudaSetDevice(prev_device); + C10_CUDA_CHECK(cudaSetDevice(prev_device)); } void process_events() diff --git a/c10/cuda/impl/CUDAGuardImpl.h b/c10/cuda/impl/CUDAGuardImpl.h index 3da750bbe6e580..21b7298628e36f 100644 --- a/c10/cuda/impl/CUDAGuardImpl.h +++ b/c10/cuda/impl/CUDAGuardImpl.h @@ -39,7 +39,10 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface { C10_CUDA_CHECK(cudaSetDevice(d.index())); } void uncheckedSetDevice(Device d) const noexcept override { - cudaSetDevice(d.index()); + cudaError_t __err = cudaSetDevice(d.index()); + if (__err != cudaSuccess) { + AT_WARN("CUDA error: ", cudaGetErrorString(__err)); + } } Stream getStream(Device d) const noexcept override { return getCurrentCUDAStream().unwrap(); diff --git a/c10/cuda/impl/CUDATest.cpp b/c10/cuda/impl/CUDATest.cpp index f80cb95b045d05..3746d14ae51cf4 100644 --- a/c10/cuda/impl/CUDATest.cpp +++ b/c10/cuda/impl/CUDATest.cpp @@ -1,5 +1,6 @@ // Just a little test file to make sure that the CUDA library works +#include #include #include @@ -8,9 +9,18 @@ namespace c10 { namespace cuda { namespace impl { +bool has_cuda_gpu() { + int count; + C10_CUDA_CHECK(cudaGetDeviceCount(&count)); + + return count != 0; +} + int c10_cuda_test() { - int r; - cudaGetDevice(&r); + int r = 0; + if (has_cuda_gpu()) { + C10_CUDA_CHECK(cudaGetDevice(&r)); + } return r; } diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 0e62708f165c72..9eb7fe5c83ea9f 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -203,7 +203,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { // FinishDeviceComputation must be called on the same cpu thread as // SwitchToDevice() void FinishDeviceComputation() override { - cudaStreamSynchronize(getCudaObjects().GetStream(gpu_id_)); + CUDA_ENFORCE(cudaStreamSynchronize(getCudaObjects().GetStream(gpu_id_))); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) { CAFFE_THROW("Encountered CUDA error: ", cudaGetErrorString(error)); @@ -390,7 +390,7 @@ struct CAFFE2_CUDA_API PinnedCPUAllocator final : public at::Allocator { if (err == cudaErrorInvalidValue) { free(data); // Calling cudaGetLastError will reset the cuda error. - cudaGetLastError(); + cudaError_t _err = cudaGetLastError(); } else { // For all other errors, still do a cuda check. CUDA_ENFORCE(err);