From c0f2d9a98e5e495aa06b62acd6cd78be95adc4c4 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 29 Nov 2022 08:09:00 -0500 Subject: [PATCH 01/48] only can compile cuda/omp --- .../base/device_matrix_data_kernels.hpp.inc | 13 +- common/cuda_hip/components/reduction.hpp.inc | 2 +- .../jacobi_generate_kernel.hpp.inc | 2 +- common/unified/multigrid/pgm_kernels.cpp | 2 + core/base/extended_float.hpp | 210 ++++++++++++++++-- core/preconditioner/jacobi_utils.hpp | 12 +- cuda/base/math.hpp | 1 - cuda/base/types.hpp | 47 +++- cuda/solver/common_trs_kernels.cuh | 4 +- include/ginkgo/core/base/math.hpp | 65 +++++- include/ginkgo/core/base/types.hpp | 50 ++++- include/ginkgo/core/matrix/dense.hpp | 1 + omp/components/atomic.hpp | 30 +++ reference/matrix/diagonal_kernels.cpp | 1 + 14 files changed, 392 insertions(+), 48 deletions(-) diff --git a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc b/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc index faf0ad15146..6046ef07b2b 100644 --- a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc +++ b/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc @@ -39,9 +39,13 @@ void remove_zeros(std::shared_ptr exec, auto value_ptr = as_device_type(values.get_const_data()); auto size = values.get_num_elems(); // count nonzeros - auto nnz = thrust::count_if( - thrust_policy(exec), value_ptr, value_ptr + size, - [] __device__(device_value_type value) { return is_nonzero(value); }); + // __half != is only device, can not call __device__ from a __host__ + // __device__ (is_nonzero) + auto nnz = + thrust::count_if(thrust_policy(exec), value_ptr, value_ptr + size, + [] __device__(device_value_type value) { + return value != zero(value); + }); if (nnz < size) { using tuple_type = thrust::tuple; @@ -57,7 +61,8 @@ void remove_zeros(std::shared_ptr exec, as_device_type(new_values.get_data()))); thrust::copy_if(thrust_policy(exec), it, it + size, out_it, [] __device__(tuple_type entry) { - return is_nonzero(thrust::get<2>(entry)); + return thrust::get<2>(entry) != + zero(thrust::get<2>(entry)); }); // swap out storage values = std::move(new_values); diff --git a/common/cuda_hip/components/reduction.hpp.inc b/common/cuda_hip/components/reduction.hpp.inc index 39b3d3ffb37..9c3d3e4e014 100644 --- a/common/cuda_hip/components/reduction.hpp.inc +++ b/common/cuda_hip/components/reduction.hpp.inc @@ -75,7 +75,7 @@ __device__ __forceinline__ int choose_pivot(const Group& group, bool is_pivoted) { using real = remove_complex; - real lmag = is_pivoted ? -one() : abs(local_data); + real lmag = real(is_pivoted ? 
-one() : abs(local_data)); const auto pivot = reduce(group, group.thread_rank(), [&](int lidx, int ridx) { const auto rmag = group.shfl(lmag, ridx); diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc index d5b9fb85551..808dffabfd4 100644 --- a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc +++ b/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc @@ -72,7 +72,7 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility( } } - return succeeded && block_cond >= 1.0 && + return succeeded && block_cond >= remove_complex{1.0} && block_cond * static_cast>( float_traits>::eps) < remove_complex{1e-3}; diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp index a61b32dacbd..eaf6d20cb3c 100644 --- a/common/unified/multigrid/pgm_kernels.cpp +++ b/common/unified/multigrid/pgm_kernels.cpp @@ -266,6 +266,7 @@ void assign_to_exist_agg(std::shared_ptr exec, [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto weight_vals, auto diag, auto agg_const_val, auto agg_val) { + using value_type = device_type; if (agg_val[row] != -1) { return; } @@ -304,6 +305,7 @@ void assign_to_exist_agg(std::shared_ptr exec, exec, [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto weight_vals, auto diag, auto agg_val) { + using value_type = device_type; if (agg_val[row] != -1) { return; } diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index 8f6ee2b0cb9..a337b405c00 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -101,6 +101,17 @@ struct basic_float_traits { static constexpr bool rounds_to_nearest = true; }; +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +template <> +struct basic_float_traits<__half> { + using type = __half; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 10; + static constexpr int exponent_bits = 5; + static constexpr bool rounds_to_nearest = true; +}; +#endif + template <> struct basic_float_traits { using type = float32; @@ -310,22 +321,30 @@ struct precision_converter { */ class half { public: - half() noexcept = default; + GKO_ATTRIBUTES half() noexcept = default; + + GKO_ATTRIBUTES half& operator=(const half& val) = default; + GKO_ATTRIBUTES half(const half& val) = default; + // GKO_ATTRIBUTES half(half const&) = default; + // complex() = default; - GKO_ATTRIBUTES half(float32 val) noexcept + // complex(const complex& z) = default; + + explicit GKO_ATTRIBUTES half(float32 val) noexcept { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - const auto tmp = __float2half_rn(val); - data_ = reinterpret_cast(tmp); -#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - data_ = float2half(reinterpret_cast(val)); -#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + this->float2half(val); } - GKO_ATTRIBUTES half(float64 val) noexcept : half(static_cast(val)) + explicit GKO_ATTRIBUTES half(float64 val) noexcept + : half(static_cast(val)) {} - GKO_ATTRIBUTES operator float32() const noexcept + explicit GKO_ATTRIBUTES half(int val) noexcept + : half(static_cast(val)) { + + } + + GKO_ATTRIBUTES operator float() const noexcept { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return __half2float(reinterpret_cast(data_)); @@ -335,23 +354,159 @@ class half { #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } - GKO_ATTRIBUTES operator float64() const 
noexcept +// #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +// GKO_ATTRIBUTES operator __half() noexcept +// { +// return reinterpret_cast(*this); +// } +// #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + + + GKO_ATTRIBUTES half& operator+=(const float& rhs) { - return static_cast(static_cast(*this)); + auto val = *this + rhs; + this->float2half(val); + return *this; } - GKO_ATTRIBUTES half operator-() const noexcept + GKO_ATTRIBUTES half& operator/=(const float& rhs) { - auto res = *this; - // flip sign bit - res.data_ ^= f16_traits::sign_mask; - return res; + auto val = *this / rhs; + this->float2half(val); + return *this; + } + + GKO_ATTRIBUTES half& operator*=(const float& rhs) + { + auto val = *this * rhs; + this->float2half(val); + return *this; + } + + GKO_ATTRIBUTES half& operator-=(const float& rhs) + { + auto val = *this - rhs; + this->float2half(val); + return *this; + } + + // half& operator+=(const half& rhs) + // { + // auto val = *this + float(rhs); + // this->float2half(val); + // return *this; + // } + + // half& operator/=(const half& rhs) + // { + // auto val = *this / float(rhs); + // this->float2half(val); + // return *this; + // } + + // half& operator*=(const half& rhs) + // { + // auto val = *this * float(rhs); + // this->float2half(val); + // return *this; + // } + + // half& operator-=(const half& rhs) + // { + // auto val = *this - float(rhs); + // this->float2half(val); + // return *this; + // } + + GKO_ATTRIBUTES friend half operator+(half lhs, const half& rhs) + { + float flhs = lhs; + flhs += rhs; // reuse compound assignment + return half(flhs); + } + + GKO_ATTRIBUTES friend half operator-(half lhs, const half& rhs) + { + float flhs = lhs; + flhs -= rhs; // reuse compound assignment + return half(flhs); + } + + GKO_ATTRIBUTES friend half operator*(half lhs, const half& rhs) + { + float flhs = lhs; + flhs *= rhs; // reuse compound assignment + return half(flhs); + } + + GKO_ATTRIBUTES friend half operator/(half lhs, const half& rhs) + { + float flhs = lhs; + flhs /= rhs; // reuse compound assignment + return half(flhs); + } + + + GKO_ATTRIBUTES friend half operator+(half lhs, const float& rhs) + { + float flhs = lhs; + flhs += rhs; // reuse compound assignment + return half(flhs); + } + + GKO_ATTRIBUTES friend half operator-(half lhs, const float& rhs) + { + float flhs = lhs; + flhs -= rhs; // reuse compound assignment + return half(flhs); + } + + GKO_ATTRIBUTES friend half operator*(half lhs, const float& rhs) + { + float flhs = lhs; + flhs *= rhs; // reuse compound assignment + return half(flhs); + } + + GKO_ATTRIBUTES friend half operator/(half lhs, const float& rhs) + { + float flhs = lhs; + flhs /= rhs; // reuse compound assignment + return half(flhs); + } + + GKO_ATTRIBUTES half& operator=(int val) + { + this->float2half(float(val)); + return *this; + } + + GKO_ATTRIBUTES half& operator=(float val) + { + this->float2half(val); + return *this; + } + + GKO_ATTRIBUTES half& operator=(double val) + { + this->float2half(static_cast(val)); + return *this; } private: using f16_traits = detail::float_traits; using f32_traits = detail::float_traits; + GKO_ATTRIBUTES void float2half(float val) noexcept + { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + const auto tmp = __float2half_rn(val); + data_ = reinterpret_cast(tmp); +#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + data_ = float2half(reinterpret_cast(val)); +#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + } + 
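+    // Note on the host path above: outside device code the conversion goes
+    // through the static float2half(uint32)/half2float(uint16) helpers below
+    // (built on detail::precision_converter), which work purely on the bit
+    // representation -- re-biasing the 8-bit float32 exponent to the 5-bit
+    // float16 exponent and shortening the significand from 23 to 10 bits with
+    // round-to-nearest, per the basic_float_traits above -- so no device
+    // intrinsics are needed on the CPU.
+    // Minimal usage sketch (illustrative only, not part of this patch):
+    //   gko::half h(1.5f);      // explicit construction from float
+    //   h = 2.0f;               // assignment re-encodes via float2half
+    //   float f = h;            // implicit conversion back to float
+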
static uint16 float2half(uint32 data_) noexcept { using conv = detail::precision_converter; @@ -493,9 +648,14 @@ class complex { public: using value_type = gko::half; - complex(const value_type& real = 0.f, const value_type& imag = 0.f) + complex(const value_type& real = value_type(0.f), + const value_type& imag = value_type(0.f)) : real_(real), imag_(imag) {} + template + explicit complex(const T& real, const U& imag) + : complex(static_cast(real), static_cast(imag)) + {} template explicit complex(const complex& other) @@ -574,6 +734,20 @@ struct numeric_limits { { return gko::detail::float_traits::eps; } + + static constexpr float infinity() + { + return numeric_limits::infinity(); + } + + static constexpr float min() { return numeric_limits::min(); } + + static constexpr float max() { return numeric_limits::max(); } + + static constexpr float quiet_NaN() + { + return numeric_limits::quiet_NaN(); + } }; } // namespace std diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp index 957d5b4a324..f929fcc5eba 100644 --- a/core/preconditioner/jacobi_utils.hpp +++ b/core/preconditioner/jacobi_utils.hpp @@ -144,21 +144,23 @@ GKO_ATTRIBUTES GKO_INLINE uint32 get_supported_storage_reductions( auto supported = static_cast(prd::p0n0); // the following code uses short-circuiting to avoid calling possibly // expensive verificatiors multiple times - if (accurate(float_traits>>::eps)) { + if (accurate(type(float_traits>>::eps))) { supported |= prd::p2n0; } - if (accurate(float_traits>>::eps) && + if (accurate( + type(float_traits>>::eps)) && (is_verified1 = verificator1())) { supported |= prd::p1n1; } - if (accurate(float_traits>>::eps) && + if (accurate(type( + float_traits>>::eps)) && is_verified1 != 0 && verificator2()) { supported |= prd::p0n2; } - if (accurate(float_traits>::eps)) { + if (accurate(type(float_traits>::eps))) { supported |= prd::p1n0; } - if (accurate(float_traits>::eps) && + if (accurate(type(float_traits>::eps)) && (is_verified1 == 1 || (is_verified1 == 2 && (is_verified1 = verificator1())))) { supported |= prd::p0n1; diff --git a/cuda/base/math.hpp b/cuda/base/math.hpp index 07243caa25f..026c570957c 100644 --- a/cuda/base/math.hpp +++ b/cuda/base/math.hpp @@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include - namespace gko { diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 20dbccbe785..90767061ea2 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -50,16 +50,33 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +namespace std { + +template <> +struct is_scalar<__half> : std::true_type {}; + +} // namespace std + namespace gko { +#if defined(__CUDA_ARCH__) +template <> +__device__ __forceinline__ bool is_nan(const __half& val) +{ + return is_nan(float(val)); +} +#endif namespace kernels { namespace cuda { +#if defined(__CUDA_ARCH__) +// template <> +__device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } - +__device__ __forceinline__ __half sqrt(const __half& val) { return hsqrt(val); } +#endif namespace detail { - /** * @internal * @@ -156,6 +173,17 @@ struct culibs_type_impl> { using type = cuDoubleComplex; }; + +template <> +struct culibs_type_impl { + using type = __half; +}; + +template <> +struct culibs_type_impl> { + using type = __half2; +}; + template struct culibs_type_impl> { using type = typename culibs_type_impl>::type; @@ -186,6 +214,11 @@ struct cuda_type_impl { using type = volatile typename cuda_type_impl::type; }; +template <> +struct cuda_type_impl { + using type = __half; +}; + template struct cuda_type_impl> { using type = thrust::complex; @@ -201,6 +234,11 @@ struct cuda_type_impl { using type = thrust::complex; }; +template <> +struct cuda_type_impl<__half2> { + using type = thrust::complex<__half>; +}; + template struct cuda_struct_member_type_impl { using type = T; @@ -211,6 +249,11 @@ struct cuda_struct_member_type_impl> { using type = fake_complex; }; +template <> +struct cuda_struct_member_type_impl { + using type = __half; +}; + template struct cuda_type_impl> { using type = matrix_data_entry< diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 6ee2c7521ff..fa6e1de79fa 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -484,7 +484,7 @@ __global__ void sptrsv_naive_legacy_kernel( const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1]; const int row_step = is_upper ? -1 : 1; - ValueType sum = 0.0; + ValueType sum = ValueType{0.0}; auto j = row_begin; auto col = colidxs[j]; while (j != row_end) { @@ -538,7 +538,7 @@ void sptrsv_naive_caching(std::shared_ptr exec, const auto nrhs = b->get_size()[1]; // Initialize x to all NaNs. - dense::fill(exec, x, nan()); + dense::fill(exec, x, ValueType(nan())); array nan_produced(exec, 1); array atomic_counter(exec, 1); diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 70e4db5bb2d..5ae33d385cf 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -62,6 +62,13 @@ using std::abs; using std::sqrt; +inline half abs(half a) { return half((a > 0) ? a : -a); } +inline half abs(std::complex a) +{ + return half(sqrt(float(a.real() * a.real() + a.imag() * a.imag()))); +} +inline half sqrt(half a) { return half(sqrt(float(a))); } + } // namespace reference } // namespace kernels @@ -76,6 +83,14 @@ using std::abs; using std::sqrt; +inline half abs(half a) { return half((a > 0) ? 
a : -a); } +inline half abs(std::complex a) +{ + return half(sqrt(float(a.real() * a.real() + a.imag() * a.imag()))); +} +inline half sqrt(half a) { return half(sqrt(float(a))); } + + } // namespace omp } // namespace kernels @@ -389,6 +404,11 @@ namespace detail { template struct next_precision_impl {}; +template <> +struct next_precision_impl { + using type = float; +}; + template <> struct next_precision_impl { using type = double; @@ -396,7 +416,7 @@ struct next_precision_impl { template <> struct next_precision_impl { - using type = float; + using type = half; }; template @@ -447,11 +467,22 @@ struct increase_precision_impl { }; +template +struct arth_type { + using type = T; +}; + +template <> +struct arth_type { + using type = float; +}; + template struct infinity_impl { // CUDA doesn't allow us to call std::numeric_limits functions // so we need to store the value instead. - static constexpr auto value = std::numeric_limits::infinity(); + static constexpr auto value = + std::numeric_limits::type>::infinity(); }; @@ -655,7 +686,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr int64 ceildiv(int64 num, int64 den) template GKO_INLINE __host__ constexpr T zero() { - return T{}; + return T(0.0); } @@ -683,7 +714,7 @@ GKO_INLINE __host__ constexpr T zero(const T&) template GKO_INLINE __host__ constexpr T one() { - return T(1); + return T(1.0); } @@ -713,7 +744,7 @@ GKO_INLINE __device__ constexpr std::enable_if_t< !std::is_same>>::value, T> zero() { - return T{}; + return T(0.0); } @@ -743,7 +774,7 @@ GKO_INLINE __device__ constexpr std::enable_if_t< !std::is_same>>::value, T> one() { - return T(1); + return T(1.0); } @@ -774,7 +805,7 @@ GKO_INLINE __device__ constexpr T one(const T&) template GKO_INLINE GKO_ATTRIBUTES constexpr T zero() { - return T{}; + return T(half{0.0}); } @@ -802,7 +833,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T zero(const T&) template GKO_INLINE GKO_ATTRIBUTES constexpr T one() { - return T(1); + return T(1.0); } @@ -998,7 +1029,7 @@ template GKO_ATTRIBUTES GKO_INLINE constexpr std::enable_if_t::value, T> imag_impl(const T&) { - return T{}; + return T(0.0); } template @@ -1103,7 +1134,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr xstd::enable_if_t::value, T> abs(const T& x) { - return x >= zero() ? x : -x; + return x >= zero() ? x : static_cast(-x); } @@ -1198,7 +1229,8 @@ template GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_finite(const T& value) { - constexpr T infinity{detail::infinity_impl::value}; + constexpr typename detail::arth_type::type infinity{ + detail::infinity_impl::value}; return abs(value) < infinity; } @@ -1282,7 +1314,16 @@ GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan( * @return NaN. 
*/ template -GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t::value, T> +GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t< + !is_complex_s::value && !std::is_same::value, T> +nan() +{ + return std::numeric_limits::quiet_NaN(); +} + +template +GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t< + std::is_same::value, float> nan() { return std::numeric_limits::quiet_NaN(); diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 68b5da6e3eb..aae25a7e1ad 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -429,11 +429,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ + template _macro(half); \ template _macro(float); \ template <> \ _macro(double) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ + template _macro(half); \ template _macro(float); \ template _macro(double) #endif @@ -450,12 +452,14 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ + template _macro(std::complex); \ template _macro(std::complex); \ template <> \ _macro(std::complex) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ + template _macro(std::complex); \ template _macro(std::complex); \ template _macro(std::complex) #endif @@ -473,21 +477,27 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + template _macro(half, half); \ template _macro(float, float); \ template <> \ _macro(double, double) GKO_NOT_IMPLEMENTED; \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + template _macro(std::complex, half); \ template _macro(std::complex, float); \ template <> \ _macro(std::complex, double) GKO_NOT_IMPLEMENTED; #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + template _macro(half, half); \ template _macro(float, float); \ template _macro(double, double); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ + template _macro(std::complex, half); \ template _macro(std::complex, float); \ template _macro(std::complex, double) #endif @@ -517,16 +527,20 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ + template _macro(half, int32); \ template _macro(float, int32); \ template <> \ _macro(double, int32) GKO_NOT_IMPLEMENTED; \ + template _macro(half, int64); \ template _macro(float, int64); \ template <> \ _macro(double, int64) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ + template _macro(half, int32); \ template _macro(float, int32); \ template _macro(double, int32); \ + template _macro(half, int64); \ template _macro(float, int64); \ template _macro(double, int64) #endif @@ -543,17 +557,21 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define 
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ + template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ template <> \ _macro(std::complex, int32) GKO_NOT_IMPLEMENTED; \ + template _macro(std::complex, int64); \ template _macro(std::complex, int64); \ template <> \ _macro(std::complex, int64) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ + template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ + template _macro(std::complex, int64); \ template _macro(std::complex, int64); \ template _macro(std::complex, int64) #endif @@ -571,6 +589,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro) \ + template _macro(half, int32, int32); \ + template _macro(half, int32, int64); \ + template _macro(half, int64, int64); \ template _macro(float, int32, int32); \ template _macro(float, int32, int64); \ template _macro(float, int64, int64); \ @@ -583,6 +604,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro) \ + template _macro(half, int32, int32); \ + template _macro(half, int32, int64); \ + template _macro(half, int64, int64); \ template _macro(float, int32, int32); \ template _macro(float, int32, int64); \ template _macro(float, int64, int64); \ @@ -604,6 +628,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ + template _macro(std::complex, int32, int32); \ + template _macro(std::complex, int32, int64); \ + template _macro(std::complex, int64, int64); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -617,6 +644,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ + template _macro(std::complex, int32, int32); \ + template _macro(std::complex, int32, int64); \ + template _macro(std::complex, int64, int64); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -659,8 +689,16 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ template _macro(float, double); \ template _macro(double, float); \ + template _macro(half, double); \ + template _macro(double, half); \ + template _macro(float, half); \ + template _macro(half, float); \ template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex) + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex) /** @@ -674,8 +712,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #define 
GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ + template _macro(half, half); \ template _macro(float, float); \ template _macro(double, double); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) #endif @@ -690,10 +730,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * value and index types. */ #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ + template _macro(half, half); \ template _macro(float, float); \ template _macro(double, double); \ + template _macro(std::complex, half); \ template _macro(std::complex, float); \ template _macro(std::complex, double); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -713,9 +756,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(int64, int64); \ template _macro(unsigned int, unsigned int); \ template _macro(unsigned long, unsigned long); \ + template _macro(half, half); \ template _macro(float, float); \ template _macro(double, double); \ template _macro(long double, long double); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -728,6 +773,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * value and index types. */ #define GKO_INSTANTIATE_FOR_EACH_POD_TYPE(_macro) \ + template _macro(half); \ template _macro(float); \ template _macro(double); \ template _macro(std::complex); \ @@ -795,5 +841,5 @@ using comm_index_type = int; } // namespace experimental } // namespace gko - +#include "core/base/extended_float.hpp" #endif // GKO_PUBLIC_CORE_BASE_TYPES_HPP_ diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index ae738d49b93..03460ddb861 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -108,6 +108,7 @@ class Dense : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo>>, + public ConvertibleTo>>>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp index 9ff4cee376a..4f8c3fa4e6a 100644 --- a/omp/components/atomic.hpp +++ b/omp/components/atomic.hpp @@ -68,6 +68,36 @@ void atomic_add(ValueType& out, ValueType val) } +template +inline ResultType reinterpret(ValueType val) +{ + static_assert(sizeof(ValueType) == sizeof(ResultType), + "The type to reinterpret to must be of the same size as the " + "original type."); + return reinterpret_cast(val); +} + + +template <> +void atomic_add(half& out, half val) +{ + // UB? + uint16_t* address_as_converter = reinterpret_cast(&out); + uint16_t old = *address_as_converter; + uint16_t assumed; + do { + assumed = old; + auto answer = reinterpret(reinterpret(assumed) + val); +#pragma omp atomic capture +{ + old = *address_as_converter; + *address_as_converter = (old == assumed) ? answer : old; +} + } while (assumed != old); + +} // namespace omp + + } // namespace omp } // namespace kernels } // namespace gko diff --git a/reference/matrix/diagonal_kernels.cpp b/reference/matrix/diagonal_kernels.cpp index 61ccfdd0620..03ce332192f 100644 --- a/reference/matrix/diagonal_kernels.cpp +++ b/reference/matrix/diagonal_kernels.cpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include "core/base/extended_float.hpp" namespace gko { From ac1dbb22ea0c5d757b2804d0825b8835d87ec923 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 5 Jan 2023 15:35:28 -0600 Subject: [PATCH 02/48] next_precision to itself when complex only float, double add empty conditional --- include/ginkgo/core/base/math.hpp | 18 ++++++++++++++---- include/ginkgo/core/base/types.hpp | 26 +------------------------- include/ginkgo/core/matrix/dense.hpp | 8 +++++++- 3 files changed, 22 insertions(+), 30 deletions(-) diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 5ae33d385cf..4687602835b 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -419,11 +419,21 @@ struct next_precision_impl { using type = half; }; -template -struct next_precision_impl> { - using type = std::complex::type>; +template <> +struct next_precision_impl> { + using type = std::complex; }; +template <> +struct next_precision_impl> { + using type = std::complex; +}; + +// template +// struct next_precision_impl> { +// using type = std::complex::type>; +// }; + template struct reduce_precision_impl { @@ -805,7 +815,7 @@ GKO_INLINE __device__ constexpr T one(const T&) template GKO_INLINE GKO_ATTRIBUTES constexpr T zero() { - return T(half{0.0}); + return T(0.0); } diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index aae25a7e1ad..a2fd5234cb2 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -452,14 +452,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ - template _macro(std::complex); \ template _macro(std::complex); \ template <> \ _macro(std::complex) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ - template _macro(std::complex); \ template _macro(std::complex); \ template _macro(std::complex) #endif @@ -481,11 +479,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(float, float); \ template <> \ _macro(double, double) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, half); \ template _macro(std::complex, float); \ template <> \ _macro(std::complex, double) GKO_NOT_IMPLEMENTED; @@ -494,10 +490,8 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(half, half); \ template _macro(float, float); \ template _macro(double, double); \ - template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ - template _macro(std::complex, half); \ template _macro(std::complex, float); \ template _macro(std::complex, double) #endif @@ -557,21 +551,17 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ - template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ template <> \ _macro(std::complex, int32) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, int64); \ template _macro(std::complex, int64); \ 
template <> \ _macro(std::complex, int64) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ - template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ - template _macro(std::complex, int64); \ template _macro(std::complex, int64); \ template _macro(std::complex, int64) #endif @@ -628,9 +618,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ - template _macro(std::complex, int32, int32); \ - template _macro(std::complex, int32, int64); \ - template _macro(std::complex, int64, int64); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -644,9 +631,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ - template _macro(std::complex, int32, int32); \ - template _macro(std::complex, int32, int64); \ - template _macro(std::complex, int64, int64); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -694,11 +678,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(float, half); \ template _macro(half, float); \ template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex) + template _macro(std::complex, std::complex) /** @@ -715,7 +695,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(half, half); \ template _macro(float, float); \ template _macro(double, double); \ - template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) #endif @@ -733,10 +712,8 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(half, half); \ template _macro(float, float); \ template _macro(double, double); \ - template _macro(std::complex, half); \ template _macro(std::complex, float); \ template _macro(std::complex, double); \ - template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -760,7 +737,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(float, float); \ template _macro(double, double); \ template _macro(long double, long double); \ - template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 03460ddb861..b65afa31ef0 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -88,6 +88,8 @@ template class SparsityCsr; +class Empty {}; + /** * Dense is a matrix format which explicitly stores all values of the matrix. 
* @@ -108,7 +110,11 @@ class Dense : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo>>, - public ConvertibleTo>>>, + public std::conditional< + std::is_same>, + ValueType>::value, + Empty, + ConvertibleTo>>>>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, From 7bffc78b8bc0e0252b83eecf7d2ff5d7380c99eb Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Sat, 7 Jan 2023 21:14:29 -0600 Subject: [PATCH 03/48] can compile with cuda/omp/ref (without test) --- accessor/cuda_helper.hpp | 7 +- .../unified/components/fill_array_kernels.cpp | 2 +- core/base/extended_float.hpp | 147 ++++++++++++++---- core/base/mixed_precision_types.hpp | 93 ++++++++++- core/base/mtx_io.cpp | 17 +- core/matrix/coo.cpp | 19 +++ core/matrix/csr.cpp | 20 +++ core/matrix/dense.cpp | 27 +++- core/matrix/diagonal.cpp | 16 ++ core/matrix/ell.cpp | 20 +++ core/matrix/fbcsr.cpp | 21 +++ core/matrix/hybrid.cpp | 21 +++ core/matrix/row_gatherer.cpp | 6 +- core/matrix/sellp.cpp | 21 +++ core/multigrid/pgm.cpp | 2 +- core/preconditioner/jacobi.cpp | 2 +- core/solver/cb_gmres.cpp | 4 +- core/solver/multigrid.cpp | 14 +- core/stop/residual_norm.cpp | 4 +- cuda/CMakeLists.txt | 1 + cuda/base/types.hpp | 2 +- cuda/matrix/fft_kernels.cu | 6 +- cuda/solver/cb_gmres_kernels.cu | 3 +- cuda/solver/idr_kernels.cu | 8 +- dpcpp/solver/cb_gmres_kernels.dp.cpp | 3 +- hip/matrix/csr_kernels.instantiate.hip.cpp | 24 +++ hip/solver/cb_gmres_kernels.hip.cpp | 3 +- include/ginkgo/core/base/math.hpp | 27 ++-- include/ginkgo/core/base/matrix_data.hpp | 2 +- .../ginkgo/core/base/precision_dispatch.hpp | 23 ++- include/ginkgo/core/base/types.hpp | 91 +++++++---- include/ginkgo/core/matrix/coo.hpp | 10 +- include/ginkgo/core/matrix/csr.hpp | 13 +- include/ginkgo/core/matrix/dense.hpp | 23 ++- include/ginkgo/core/matrix/diagonal.hpp | 9 +- include/ginkgo/core/matrix/ell.hpp | 9 +- include/ginkgo/core/matrix/fbcsr.hpp | 9 +- include/ginkgo/core/matrix/hybrid.hpp | 10 +- include/ginkgo/core/matrix/sellp.hpp | 9 +- omp/CMakeLists.txt | 1 + omp/components/atomic.hpp | 17 +- omp/matrix/fft_kernels.cpp | 6 +- omp/solver/cb_gmres_kernels.cpp | 2 +- omp/solver/idr_kernels.cpp | 11 +- reference/CMakeLists.txt | 1 + reference/matrix/fft_kernels.cpp | 6 +- reference/solver/cb_gmres_kernels.cpp | 2 +- reference/solver/idr_kernels.cpp | 10 +- 48 files changed, 656 insertions(+), 148 deletions(-) diff --git a/accessor/cuda_helper.hpp b/accessor/cuda_helper.hpp index 30af6b24777..c1e1696acbf 100644 --- a/accessor/cuda_helper.hpp +++ b/accessor/cuda_helper.hpp @@ -57,6 +57,11 @@ struct cuda_type { using type = T; }; +template <> +struct cuda_type { + using type = __half; +}; + // Unpack cv and reference / pointer qualifiers template struct cuda_type { @@ -87,7 +92,7 @@ struct cuda_type { // Transform std::complex to thrust::complex template struct cuda_type> { - using type = thrust::complex; + using type = thrust::complex::type>; }; diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp index 457d3d368e7..bb6ad681503 100644 --- a/common/unified/components/fill_array_kernels.cpp +++ b/common/unified/components/fill_array_kernels.cpp @@ -61,7 +61,7 @@ void fill_seq_array(std::shared_ptr exec, ValueType* array, size_type n) { run_kernel( - exec, [] GKO_KERNEL(auto idx, auto array) { array[idx] = idx; }, n, + exec, [] GKO_KERNEL(auto idx, auto array) { array[idx] = static_cast(idx); }, n, array); } diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp 
index a337b405c00..f7b25954cff 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -340,9 +340,8 @@ class half { {} explicit GKO_ATTRIBUTES half(int val) noexcept - : half(static_cast(val)) { - - } + : half(static_cast(val)) + {} GKO_ATTRIBUTES operator float() const noexcept { @@ -354,12 +353,12 @@ class half { #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } -// #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) -// GKO_ATTRIBUTES operator __half() noexcept -// { -// return reinterpret_cast(*this); -// } -// #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + // #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + // GKO_ATTRIBUTES operator __half() noexcept + // { + // return reinterpret_cast(*this); + // } + // #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) GKO_ATTRIBUTES half& operator+=(const float& rhs) @@ -447,32 +446,38 @@ class half { } - GKO_ATTRIBUTES friend half operator+(half lhs, const float& rhs) - { - float flhs = lhs; - flhs += rhs; // reuse compound assignment - return half(flhs); - } + // GKO_ATTRIBUTES friend half operator+(half lhs, const float& rhs) + // { + // float flhs = lhs; + // flhs += rhs; // reuse compound assignment + // return half(flhs); + // } - GKO_ATTRIBUTES friend half operator-(half lhs, const float& rhs) - { - float flhs = lhs; - flhs -= rhs; // reuse compound assignment - return half(flhs); - } + // GKO_ATTRIBUTES friend half operator-(half lhs, const float& rhs) + // { + // float flhs = lhs; + // flhs -= rhs; // reuse compound assignment + // return half(flhs); + // } - GKO_ATTRIBUTES friend half operator*(half lhs, const float& rhs) - { - float flhs = lhs; - flhs *= rhs; // reuse compound assignment - return half(flhs); - } + // GKO_ATTRIBUTES friend half operator*(half lhs, const float& rhs) + // { + // float flhs = lhs; + // flhs *= rhs; // reuse compound assignment + // return half(flhs); + // } - GKO_ATTRIBUTES friend half operator/(half lhs, const float& rhs) + // GKO_ATTRIBUTES friend half operator/(half lhs, const float& rhs) + // { + // float flhs = lhs; + // flhs /= rhs; // reuse compound assignment + // return half(flhs); + // } + + GKO_ATTRIBUTES half& operator=(long long int val) { - float flhs = lhs; - flhs /= rhs; // reuse compound assignment - return half(flhs); + this->float2half(float(val)); + return *this; } GKO_ATTRIBUTES half& operator=(int val) @@ -493,6 +498,12 @@ class half { return *this; } + GKO_ATTRIBUTES half operator-() const + { + auto val = 0.0f - *this; + return half(val); + } + private: using f16_traits = detail::float_traits; using f32_traits = detail::float_traits; @@ -657,6 +668,10 @@ class complex { : complex(static_cast(real), static_cast(imag)) {} + template + explicit complex(const T& real) : complex(static_cast(real)) + {} + template explicit complex(const complex& other) : complex(static_cast(other.real()), @@ -674,6 +689,76 @@ class complex { static_cast(imag_)); } + complex& operator=(const int& __re) + { + real_ = __re; + imag_ = value_type(); + return *this; + } + + complex& operator=(const value_type& __re) + { + real_ = __re; + imag_ = value_type(); + return *this; + } + complex& operator+=(const value_type& __re) + { + real_ += __re; + return *this; + } + complex& operator-=(const value_type& __re) + { + real_ -= __re; + return *this; + } + complex& operator*=(const value_type& __re) + { + real_ *= __re; + imag_ *= __re; + return *this; + } + complex& operator/=(const value_type& __re) + { + 
real_ /= __re; + imag_ /= __re; + return *this; + } + + template + complex& operator=(const complex<_Xp>& __c) + { + real_ = __c.real(); + imag_ = __c.imag(); + return *this; + } + template + complex& operator+=(const complex<_Xp>& __c) + { + real_ += __c.real(); + imag_ += __c.imag(); + return *this; + } + template + complex& operator-=(const complex<_Xp>& __c) + { + real_ -= __c.real(); + imag_ -= __c.imag(); + return *this; + } + template + complex& operator*=(const complex<_Xp>& __c) + { + *this = *this * complex(__c.real(), __c.imag()); + return *this; + } + template + complex& operator/=(const complex<_Xp>& __c) + { + *this = *this / complex(__c.real(), __c.imag()); + return *this; + } + private: value_type real_; value_type imag_; diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index b5c1e37569b..5aa13c2cb66 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -40,40 +40,103 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef GINKGO_MIXED_PRECISION + #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \ + template _macro(float, half, half, __VA_ARGS__); \ + template _macro(float, half, float, __VA_ARGS__); \ + template _macro(float, half, double, __VA_ARGS__); \ + template _macro(float, float, half, __VA_ARGS__); \ template _macro(float, float, float, __VA_ARGS__); \ template _macro(float, float, double, __VA_ARGS__); \ + template _macro(float, double, half, __VA_ARGS__); \ template _macro(float, double, float, __VA_ARGS__); \ template _macro(float, double, double, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \ + template _macro(double, half, half, __VA_ARGS__); \ + template _macro(double, half, float, __VA_ARGS__); \ + template _macro(double, half, double, __VA_ARGS__); \ + template _macro(double, float, half, __VA_ARGS__); \ template _macro(double, float, float, __VA_ARGS__); \ template _macro(double, float, double, __VA_ARGS__); \ + template _macro(double, double, half, __VA_ARGS__); \ template _macro(double, double, float, __VA_ARGS__); \ template _macro(double, double, double, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) 
\ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) \ + template _macro(half, half, half, __VA_ARGS__); \ + template _macro(half, half, float, __VA_ARGS__); \ + template _macro(half, half, double, __VA_ARGS__); \ + template _macro(half, float, half, __VA_ARGS__); \ + template _macro(half, float, float, __VA_ARGS__); \ + template _macro(half, float, double, __VA_ARGS__); \ + template _macro(half, double, half, __VA_ARGS__); \ + template _macro(half, double, float, __VA_ARGS__); \ + template _macro(half, double, double, __VA_ARGS__) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) + #else + #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \ template _macro(float, float, float, __VA_ARGS__) @@ -88,6 +151,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) \ + template _macro(half, half, half, __VA_ARGS__) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) + + #endif @@ -95,7 +166,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, __VA_ARGS__); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, __VA_ARGS__); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__); \ - GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__) + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ @@ -105,18 +178,36 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef GINKGO_MIXED_PRECISION #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ + template _macro(half, half, __VA_ARGS__); \ + template _macro(half, float, __VA_ARGS__); \ + template _macro(half, double, __VA_ARGS__); \ + template _macro(float, half, __VA_ARGS__); \ template _macro(float, float, __VA_ARGS__); \ template _macro(float, double, __VA_ARGS__); \ + template _macro(double, half, __VA_ARGS__); \ template _macro(double, float, __VA_ARGS__); \ template _macro(double, double, __VA_ARGS__); \ + GKO_ADAPT_CPHF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ + GKO_ADAPT_CPHF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) #else #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ + template _macro(half, half, __VA_ARGS__); \ template _macro(float, float, __VA_ARGS__); \ template _macro(double, double, __VA_ARGS__); \ + GKO_ADAPT_CPHF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) #endif diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index de4f6ec1e86..c89da00f365 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -778,19 +778,28 @@ static constexpr uint64 binary_format_magic() { constexpr auto is_int = std::is_same::value; constexpr auto is_long = std::is_same::value; + constexpr auto is_half = std::is_same::value; constexpr auto is_double = std::is_same::value; constexpr auto is_float = std::is_same::value; constexpr auto is_complex_double = std::is_same>::value; constexpr auto is_complex_float = std::is_same>::value; + constexpr auto is_complex_half = + std::is_same>::value; static_assert(is_int || is_long, "invalid storage index type"); - static_assert( - is_double || is_float || is_complex_double || is_complex_float, - "invalid storage value type"); + static_assert(is_half || is_complex_half || is_double || is_float || + is_complex_double || is_complex_float, + "invalid storage value type"); constexpr auto index_bit = is_int ? 'I' : 'L'; constexpr auto value_bit = - is_double ? 'D' : (is_float ? 'S' : (is_complex_double ? 'Z' : 'C')); + is_double + ? 'D' + : (is_float + ? 'S' + : (is_complex_double + ? 'Z' + : (is_complex_float ? 'C' : (is_half ? 
'H' : 'X')))); constexpr uint64 shift = 256; constexpr uint64 type_bits = index_bit * shift + value_bit; return 'G' + diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 6d28cf2f7b7..3438b509983 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -144,6 +144,25 @@ void Coo::move_to( } +template +void Coo::convert_to( + Coo>, IndexType>* result) const +{ + result->values_ = this->values_; + result->row_idxs_ = this->row_idxs_; + result->col_idxs_ = this->col_idxs_; + result->set_size(this->get_size()); +} + + +template +void Coo::move_to( + Coo>, IndexType>* result) +{ + this->convert_to(result); +} + + template void Coo::convert_to( Csr* result) const diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 9a4697c1195..92f4665d828 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -258,6 +258,26 @@ void Csr::move_to( } +template +void Csr::convert_to( + Csr>, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->set_size(this->get_size()); + convert_strategy_helper(result); +} + + +template +void Csr::move_to( + Csr>, IndexType>* result) +{ + this->convert_to(result); +} + + template void Csr::convert_to( Coo* result) const diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 17dec93c234..703ae70d0b6 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -585,6 +585,30 @@ void Dense::move_to(Dense>* result) } +template +void Dense::convert_to( + Dense>>* result) const +{ + if (result->get_size() != this->get_size()) { + result->set_size(this->get_size()); + result->stride_ = stride_; + result->values_.resize_and_reset(result->get_size()[0] * + result->stride_); + } + auto exec = this->get_executor(); + exec->run(dense::make_copy( + this, make_temporary_output_clone(exec, result).get())); +} + + +template +void Dense::move_to( + Dense>>* result) +{ + this->convert_to(result); +} + + template template void Dense::convert_impl(Coo* result) const @@ -1343,7 +1367,8 @@ void gather_mixed_real_complex(Function fn, LinOp* out) #ifdef GINKGO_MIXED_PRECISION using fst_type = matrix::Dense; using snd_type = matrix::Dense>; - run(out, fn); + using trd_type = matrix::Dense>>; + run(out, fn); #else precision_dispatch(fn, out); #endif diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index 17edfb9cd8b..bbc017ffb3e 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -192,6 +192,22 @@ void Diagonal::move_to(Diagonal>* result) this->convert_to(result); } +template +void Diagonal::convert_to( + Diagonal>>* result) const +{ + result->values_ = this->values_; + result->set_size(this->get_size()); +} + + +template +void Diagonal::move_to( + Diagonal>>* result) +{ + this->convert_to(result); +} + template void Diagonal::convert_to(Csr* result) const diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index 4c859656866..cd9067ac9dc 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -202,6 +202,26 @@ void Ell::move_to( } +template +void Ell::convert_to( + Ell>, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->num_stored_elements_per_row_ = this->num_stored_elements_per_row_; + result->stride_ = this->stride_; + result->set_size(this->get_size()); +} + + +template +void Ell::move_to( + Ell>, IndexType>* result) +{ + this->convert_to(result); +} + + template void Ell::convert_to(Dense* result) const { diff --git a/core/matrix/fbcsr.cpp 
b/core/matrix/fbcsr.cpp index f5494871791..fb57db6979c 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -198,6 +198,27 @@ void Fbcsr::move_to( } +template +void Fbcsr::convert_to( + Fbcsr>, IndexType>* const result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->set_size(this->get_size()); + // block sizes are immutable except for assignment/conversion + result->bs_ = this->bs_; +} + + +template +void Fbcsr::move_to( + Fbcsr>, IndexType>* const result) +{ + this->convert_to(result); +} + + template void Fbcsr::convert_to( Dense* const result) const diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index b49a6241c37..2ea83e5a317 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -181,6 +181,27 @@ void Hybrid::move_to( } +template +void Hybrid::convert_to( + Hybrid>, IndexType>* result) const +{ + this->ell_->convert_to(result->ell_.get()); + this->coo_->convert_to(result->coo_.get()); + // TODO set strategy correctly + // There is no way to correctly clone the strategy like in + // Csr::convert_to + result->set_size(this->get_size()); +} + + +template +void Hybrid::move_to( + Hybrid>, IndexType>* result) +{ + this->convert_to(result); +} + + template void Hybrid::convert_to(Dense* result) const { diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp index 442b192f07d..f7a23206cda 100644 --- a/core/matrix/row_gatherer.cpp +++ b/core/matrix/row_gatherer.cpp @@ -46,7 +46,8 @@ namespace matrix { template void RowGatherer::apply_impl(const LinOp* in, LinOp* out) const { - run*, const Dense*, + run*, const Dense*, const Dense*, + const Dense>*, const Dense>*, const Dense>*>( in, [&](auto gather) { gather->row_gather(&row_idxs_, out); }); } @@ -55,7 +56,8 @@ template void RowGatherer::apply_impl(const LinOp* alpha, const LinOp* in, const LinOp* beta, LinOp* out) const { - run*, const Dense*, + run*, const Dense*, const Dense*, + const Dense>*, const Dense>*, const Dense>*>( in, [&](auto gather) { gather->row_gather(alpha, &row_idxs_, beta, out); }); diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index 7a343d8e97f..8f1bc6c050a 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -177,6 +177,27 @@ void Sellp::move_to( this->convert_to(result); } +template +void Sellp::convert_to( + Sellp>, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->slice_lengths_ = this->slice_lengths_; + result->slice_sets_ = this->slice_sets_; + result->slice_size_ = this->slice_size_; + result->stride_factor_ = this->stride_factor_; + result->set_size(this->get_size()); +} + + +template +void Sellp::move_to( + Sellp>, IndexType>* result) +{ + this->convert_to(result); +} + template void Sellp::convert_to(Dense* result) const diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index 5e4ff888034..8959a7abcd5 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -176,7 +176,7 @@ void Pgm::generate() auto abs_mtx = pgm_op->compute_absolute(); // abs_mtx is already real valuetype, so transpose is enough auto weight_mtx = gko::as(abs_mtx->transpose()); - auto half_scalar = initialize>({0.5}, exec); + auto half_scalar = initialize>({half(0.5)}, exec); auto identity = matrix::Identity::create(exec, num_rows); // W = (abs_mtx + transpose(abs_mtx))/2 abs_mtx->apply(half_scalar, identity, half_scalar, weight_mtx); diff --git a/core/preconditioner/jacobi.cpp 
b/core/preconditioner/jacobi.cpp index f6f3e8018c5..5f8d194fcd6 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -319,7 +319,7 @@ void Jacobi::generate(const LinOp* system_matrix, ->extract_diagonal_linop()); auto diag_vt = ::gko::detail::temporary_conversion>:: - template create>>( + template create>>( diag.get()); if (!diag_vt) { GKO_NOT_SUPPORTED(system_matrix); diff --git a/core/solver/cb_gmres.cpp b/core/solver/cb_gmres.cpp index be9dbbf0fdb..282295f261b 100644 --- a/core/solver/cb_gmres.cpp +++ b/core/solver/cb_gmres.cpp @@ -514,8 +514,8 @@ void CbGmres::apply_impl(const LinOp* alpha, const LinOp* b, #define GKO_DECLARE_CB_GMRES(_type1) class CbGmres<_type1> #define GKO_DECLARE_CB_GMRES_TRAITS(_type1) \ struct workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(GKO_DECLARE_CB_GMRES); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(GKO_DECLARE_CB_GMRES_TRAITS); } // namespace solver diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 303106fa4f6..32a919013bb 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -314,7 +314,7 @@ void MultigridState::generate(const LinOp* system_matrix_in, auto next_nrows = mg_level_list.at(i)->get_coarse_op()->get_size()[0]; auto mg_level = mg_level_list.at(i); - run, std::complex>( mg_level, [&, this](auto mg_level, auto i, auto cycle, auto current_nrows, @@ -371,7 +371,7 @@ void MultigridState::run_mg_cycle(multigrid::cycle cycle, size_type level, return; } auto mg_level = multigrid->get_mg_level_list().at(level); - run, std::complex>( mg_level, [&, this](auto mg_level) { using value_type = @@ -516,7 +516,7 @@ void Multigrid::generate() break; } - run, std::complex>( mg_level, [this](auto mg_level, auto index, auto matrix) { @@ -554,7 +554,7 @@ void Multigrid::generate() auto last_mg_level = mg_level_list_.back(); // generate coarsest solver - run, std::complex>( last_mg_level, [this](auto mg_level, auto level, auto matrix) { @@ -640,7 +640,7 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* b, LinOp* x, b, x); }; auto first_mg_level = this->get_mg_level_list().front(); - run, std::complex>(first_mg_level, lambda, b, x); } @@ -679,7 +679,7 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* alpha, alpha, b, beta, x); }; auto first_mg_level = this->get_mg_level_list().front(); - run, std::complex>(first_mg_level, lambda, alpha, b, beta, x); } @@ -744,7 +744,7 @@ void Multigrid::apply_dense_impl(const VectorType* b, VectorType* x, auto first_mg_level = this->get_mg_level_list().front(); - run, std::complex>(first_mg_level, lambda, b, x); } diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp index ee02c8042d2..e59364ddff1 100644 --- a/core/stop/residual_norm.cpp +++ b/core/stop/residual_norm.cpp @@ -127,8 +127,8 @@ ResidualNormBase::ResidualNormBase( baseline_{baseline}, system_matrix_{args.system_matrix}, b_{args.b}, - one_{gko::initialize({1}, exec)}, - neg_one_{gko::initialize({-1}, exec)} + one_{gko::initialize({one()}, exec)}, + neg_one_{gko::initialize({-one()}, exec)} { switch (baseline_) { case mode::initial_resnorm: { diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 4c972d2a584..f882c31c1e7 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -114,6 +114,7 @@ endif() target_compile_options(ginkgo_cuda PRIVATE $<$:${GINKGO_CUDA_COMPILER_FLAGS}>) 
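The convert_to/move_to pairs added above for Coo, Csr, Dense, Diagonal, Ell, Fbcsr, Hybrid and Sellp all follow one shape: copy the index and layout arrays verbatim, let the value-array assignment perform the element-wise precision cast, and let move_to fall back to convert_to. A minimal standalone sketch of that shape, using a hypothetical ToyCoo type and convert_values helper rather than the library's classes:

#include <vector>

// element-wise cast, standing in for cross-precision array assignment
template <typename To, typename From>
std::vector<To> convert_values(const std::vector<From>& src)
{
    return std::vector<To>(src.begin(), src.end());
}

template <typename ValueType, typename IndexType>
struct ToyCoo {
    std::vector<ValueType> values;
    std::vector<IndexType> row_idxs;
    std::vector<IndexType> col_idxs;

    template <typename OtherValue>
    void convert_to(ToyCoo<OtherValue, IndexType>* result) const
    {
        result->values = convert_values<OtherValue>(values);
        result->row_idxs = row_idxs;  // index data is copied unchanged
        result->col_idxs = col_idxs;
    }

    template <typename OtherValue>
    void move_to(ToyCoo<OtherValue, IndexType>* result)
    {
        this->convert_to(result);  // move simply delegates to convert
    }
};

int main()
{
    ToyCoo<double, int> a{{1.0, 2.5}, {0, 1}, {0, 1}};
    ToyCoo<float, int> b;
    a.convert_to(&b);  // values are narrowed, indices copied as-is
}
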
target_compile_options(ginkgo_cuda PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) +target_compile_definitions(ginkgo_cuda PRIVATE GINKGO_COMPILE_KERNEL=1) ginkgo_compile_features(ginkgo_cuda) target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA) diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 90767061ea2..fdfa6953f5b 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -221,7 +221,7 @@ struct cuda_type_impl { template struct cuda_type_impl> { - using type = thrust::complex; + using type = thrust::complex::type>; }; template <> diff --git a/cuda/matrix/fft_kernels.cu b/cuda/matrix/fft_kernels.cu index 31a679df019..f9248df0125 100644 --- a/cuda/matrix/fft_kernels.cu +++ b/cuda/matrix/fft_kernels.cu @@ -151,7 +151,7 @@ void fft(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT_KERNEL); template @@ -167,7 +167,7 @@ void fft2(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT2_KERNEL); template @@ -183,7 +183,7 @@ void fft3(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu index 93e791c76e8..796daf39672 100644 --- a/cuda/solver/cb_gmres_kernels.cu +++ b/cuda/solver/cb_gmres_kernels.cu @@ -113,7 +113,8 @@ void initialize(std::shared_ptr exec, as_device_type(stop_status->get_data())); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF( + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu index 10e8a7b2fc3..4d41a79a7ba 100644 --- a/cuda/solver/idr_kernels.cu +++ b/cuda/solver/idr_kernels.cu @@ -100,10 +100,10 @@ void initialize_subspace_vectors(std::shared_ptr exec, auto gen = curand::rand_generator(std::random_device{}(), CURAND_RNG_PSEUDO_DEFAULT, exec->get_stream()); - curand::rand_vector( - gen, - subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), - 0.0, 1.0, subspace_vectors->get_values()); + // curand::rand_vector( + // gen, + // subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), + // 0.0, 1.0, subspace_vectors->get_values()); } } diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index 9630b8dcb91..fa93b55a903 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -980,7 +980,8 @@ void initialize(std::shared_ptr exec, stop_status->get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF( + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/hip/matrix/csr_kernels.instantiate.hip.cpp b/hip/matrix/csr_kernels.instantiate.hip.cpp index 9a6c29206de..c6c5fe4afe3 100644 --- a/hip/matrix/csr_kernels.instantiate.hip.cpp +++ b/hip/matrix/csr_kernels.instantiate.hip.cpp @@ -62,6 +62,12 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL, 
GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL, int32); // split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(GKO_DECLARE_CSR_SPMV_KERNEL, + int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(GKO_DECLARE_CSR_SPMV_KERNEL, + int32); +// split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(GKO_DECLARE_CSR_SPMV_KERNEL, int64); // split @@ -73,6 +79,12 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL, // split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(GKO_DECLARE_CSR_SPMV_KERNEL, + int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(GKO_DECLARE_CSR_SPMV_KERNEL, + int64); // split @@ -88,6 +100,12 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3( GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); // split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); +// split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); // split @@ -99,6 +117,12 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3( // split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); // split diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp index d47b53f2dfa..7664c456396 100644 --- a/hip/solver/cb_gmres_kernels.hip.cpp +++ b/hip/solver/cb_gmres_kernels.hip.cpp @@ -115,7 +115,8 @@ void initialize(std::shared_ptr exec, as_device_type(stop_status->get_data())); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF( + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 4687602835b..324f830e790 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -69,6 +69,11 @@ inline half abs(std::complex a) } inline half sqrt(half a) { return half(sqrt(float(a))); } +inline std::complex sqrt(std::complex a) +{ + return std::complex(sqrt(std::complex(a))); +} + } // namespace reference } // namespace kernels @@ -90,6 +95,11 @@ inline half abs(std::complex a) } inline half sqrt(half a) { return half(sqrt(float(a))); } +inline std::complex sqrt(std::complex a) +{ + return std::complex(sqrt(std::complex(a))); +} + } // namespace omp } // namespace kernels @@ -419,21 +429,12 @@ struct next_precision_impl { using type = half; }; -template <> -struct next_precision_impl> { - using type = std::complex; -}; -template <> -struct next_precision_impl> { - using type = std::complex; +template +struct next_precision_impl> { + using type = std::complex::type>; }; -// template -// struct next_precision_impl> { -// using type = std::complex::type>; -// }; - template struct reduce_precision_impl { @@ -538,7 +539,7 @@ using next_precision = typename detail::next_precision_impl::type; * next_precision. 
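The sqrt overloads added above for the half-precision complex type use the same promotion trick as the existing half routines: widen to float (or complex<float>), let the standard library do the work, then narrow back to 16 bits. A self-contained sketch of that idea with a toy 16-bit wrapper and a toy complex pair, not gko::half or std::complex<half>:

#include <cmath>
#include <complex>
#include <iostream>

// toy 16-bit-style wrapper that only stores a float
struct tiny_float {
    float v{};
    tiny_float() = default;
    explicit tiny_float(float f) : v(f) {}
    explicit operator float() const { return v; }
};

struct toy_complex {
    tiny_float re;
    tiny_float im;
};

// real-valued sqrt: widen, compute, narrow
inline tiny_float sqrt(tiny_float a)
{
    return tiny_float(std::sqrt(static_cast<float>(a)));
}

// complex sqrt: promote the whole number to std::complex<float>
inline toy_complex sqrt(toy_complex a)
{
    const std::complex<float> wide(static_cast<float>(a.re),
                                   static_cast<float>(a.im));
    const auto result = std::sqrt(wide);
    return {tiny_float(result.real()), tiny_float(result.imag())};
}

int main()
{
    toy_complex z{tiny_float(0.0f), tiny_float(2.0f)};
    const auto r = sqrt(z);  // sqrt(2i) = 1 + 1i
    std::cout << static_cast<float>(r.re) << " "
              << static_cast<float>(r.im) << "\n";
}
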
*/ template -using previous_precision = next_precision; +using previous_precision = next_precision>; /** diff --git a/include/ginkgo/core/base/matrix_data.hpp b/include/ginkgo/core/base/matrix_data.hpp index 57ac0ad5f5b..f823dfc6b76 100644 --- a/include/ginkgo/core/base/matrix_data.hpp +++ b/include/ginkgo/core/base/matrix_data.hpp @@ -67,7 +67,7 @@ template typename std::enable_if::value, ValueType>::type get_rand_value(Distribution&& dist, Generator&& gen) { - return dist(gen); + return ValueType(dist(gen)); } diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index 1ddc299fed9..574ca19a7d3 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -82,7 +82,13 @@ make_temporary_conversion(Ptr&& matrix) auto result = detail::temporary_conversion< MaybeConstDense>::template create(matrix); if (!result) { - GKO_NOT_SUPPORTED(*matrix); + result = detail::temporary_conversion>:: + template create< + matrix::Dense>>>( + matrix); + if (!result) { + GKO_NOT_SUPPORTED(matrix); + } } return result; } @@ -255,11 +261,14 @@ void mixed_precision_dispatch(Function fn, const LinOp* in, LinOp* out) #ifdef GINKGO_MIXED_PRECISION using fst_type = matrix::Dense; using snd_type = matrix::Dense>; + using trd_type = matrix::Dense>>; if (auto dense_in = dynamic_cast(in)) { if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); } else if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); } else { GKO_NOT_SUPPORTED(out); } @@ -268,6 +277,18 @@ void mixed_precision_dispatch(Function fn, const LinOp* in, LinOp* out) fn(dense_in, dense_out); } else if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else { + GKO_NOT_SUPPORTED(out); + } + } else if (auto dense_in = dynamic_cast(in)) { + if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); } else { GKO_NOT_SUPPORTED(out); } diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index a2fd5234cb2..3e83288ff16 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -419,6 +419,15 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, _enable_macro(CudaExecutor, cuda) +#if GINKGO_COMPILE_KERNEL +#define GKO_ADAPT_CPHF(_macro) \ + template <> \ + _macro GKO_NOT_IMPLEMENTED +#else +#define GKO_ADAPT_CPHF(_macro) template _macro +#endif + + /** * Instantiates a template for each non-complex value type compiled by Ginkgo. * @@ -440,6 +449,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(double) #endif +#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(_macro) \ + template _macro(float); \ + template _macro(double) + /** * Instantiates a template for each value type compiled by Ginkgo. 
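The previous_precision change above leans on the precision relation now being a three-element cycle (half -> float -> double -> half), so going one step back is the same as going two steps forward. A hedged, standalone sketch of that trait structure, with half_t standing in for gko::half:

#include <complex>
#include <type_traits>

struct half_t {};  // stand-in for gko::half in this sketch

template <typename T> struct next_precision_impl {};
template <> struct next_precision_impl<half_t> { using type = float; };
template <> struct next_precision_impl<float> { using type = double; };
template <> struct next_precision_impl<double> { using type = half_t; };

// complex types follow their underlying real type
template <typename T>
struct next_precision_impl<std::complex<T>> {
    using type = std::complex<typename next_precision_impl<T>::type>;
};

template <typename T>
using next_precision = typename next_precision_impl<T>::type;

// one step back around the cycle equals two steps forward
template <typename T>
using previous_precision = next_precision<next_precision<T>>;

static_assert(std::is_same<next_precision<float>, double>::value, "cycle");
static_assert(std::is_same<previous_precision<float>, half_t>::value, "back");
static_assert(std::is_same<previous_precision<std::complex<double>>,
                           std::complex<float>>::value, "complex back");

int main() {}
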
@@ -458,10 +471,16 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ + GKO_ADAPT_CPHF(_macro(std::complex)); \ template _macro(std::complex); \ template _macro(std::complex) #endif +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(_macro) \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(_macro); \ + template _macro(std::complex); \ + template _macro(std::complex) + /** * Instantiates a template for each value and scalar type compiled by Ginkgo. @@ -486,13 +505,15 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template <> \ _macro(std::complex, double) GKO_NOT_IMPLEMENTED; #else -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - template _macro(half, half); \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, float); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + template _macro(half, half); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex); \ + GKO_ADAPT_CPHF(_macro(std::complex, half)); \ + template _macro(std::complex, float); \ template _macro(std::complex, double) #endif @@ -560,8 +581,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ + GKO_ADAPT_CPHF(_macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ + GKO_ADAPT_CPHF(_macro(std::complex, int64)); \ template _macro(std::complex, int64); \ template _macro(std::complex, int64) #endif @@ -631,6 +654,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ + GKO_ADAPT_CPHF(_macro(std::complex, int32, int32)); \ + GKO_ADAPT_CPHF(_macro(std::complex, int32, int64)); \ + GKO_ADAPT_CPHF(_macro(std::complex, int64, int64)); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -670,14 +696,18 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. 
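GKO_ADAPT_CPHF above switches between a real explicit instantiation and a not-implemented stub depending on whether a kernel module is being compiled (GINKGO_COMPILE_KERNEL). A rough standalone sketch of that mechanism, using hypothetical MY_DECLARE/MY_ADAPT_CPHF/run_kernel names and a plain exception in place of GKO_NOT_IMPLEMENTED:

#include <complex>
#include <stdexcept>

template <typename T>
void run_kernel(const T* data, int n)
{
    (void)data;  // pretend kernel body
    (void)n;
}

#define MY_DECLARE(_type) void run_kernel(const _type*, int)

#if defined(GINKGO_COMPILE_KERNEL)
// kernel modules: stub out the unsupported type with a throwing specialization
#define MY_ADAPT_CPHF(_macro) \
    template <>               \
    _macro { throw std::runtime_error("not implemented"); }
#else
// core library: emit a normal explicit instantiation
#define MY_ADAPT_CPHF(_macro) template _macro
#endif

// float/double are instantiated everywhere; the complex<half>-like type is
// adapted per module
template MY_DECLARE(float);
template MY_DECLARE(double);
MY_ADAPT_CPHF(MY_DECLARE(std::complex<float>));

int main() {}
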
*/ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template _macro(float, double); \ - template _macro(double, float); \ - template _macro(half, double); \ - template _macro(double, half); \ - template _macro(float, half); \ - template _macro(half, float); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ + template _macro(float, double); \ + template _macro(double, float); \ + template _macro(half, double); \ + template _macro(double, half); \ + template _macro(float, half); \ + template _macro(half, float); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -690,12 +720,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ - GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ - template _macro(half, half); \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ + template _macro(half, half); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) #endif @@ -708,13 +739,15 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. 
*/ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ - template _macro(half, half); \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(std::complex, float); \ - template _macro(std::complex, double); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ + template _macro(half, half); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_CPHF(_macro(std::complex, half)); \ + template _macro(std::complex, float); \ + template _macro(std::complex, double); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -737,6 +770,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(float, float); \ template _macro(double, double); \ template _macro(long double, long double); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -752,6 +786,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(half); \ template _macro(float); \ template _macro(double); \ + GKO_ADAPT_CPHF(_macro(std::complex)); \ template _macro(std::complex); \ template _macro(std::complex); \ template _macro(size_type); \ diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index 15662294607..af68b66679d 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -78,6 +78,7 @@ template class Coo : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, + public ConvertibleTo>, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -110,13 +111,20 @@ class Coo : public EnableLinOp>, using device_mat_data = device_matrix_data; using absolute_type = remove_complex; - friend class Coo, IndexType>; + friend class Coo, IndexType>; + + friend class Coo>, IndexType>; void convert_to( Coo, IndexType>* result) const override; void move_to(Coo, IndexType>* result) override; + void convert_to( + Coo>, IndexType>* result) const override; + + void move_to(Coo>, IndexType>* result) override; + void convert_to(Csr* other) const override; void move_to(Csr* other) override; diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index 611e5d33c64..610859665fa 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -127,6 +127,8 @@ template class Csr : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, + public ConvertibleTo< + Csr>, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -716,13 +718,22 @@ class Csr : public EnableLinOp>, index_type max_length_per_row_; }; - friend class Csr, IndexType>; + friend class Csr, IndexType>; + + friend class Csr>, + IndexType>; void convert_to( Csr, IndexType>* result) const override; void move_to(Csr, IndexType>* result) override; + void convert_to(Csr>, IndexType>* + result) const override; + + void move_to(Csr>, IndexType>* + result) override; + void convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index b65afa31ef0..8c05a5bf61a 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -90,6 +90,15 @@ class 
SparsityCsr; class Empty {}; +template +using next2_type = next_precision>; + + +// template +// using conditional_type = typename std::conditional< +// std::is_same>::value, Empty, +// Dense>>::type; + /** * Dense is a matrix format which explicitly stores all values of the matrix. * @@ -110,11 +119,7 @@ class Dense : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo>>, - public std::conditional< - std::is_same>, - ValueType>::value, - Empty, - ConvertibleTo>>>>, + public ConvertibleTo>>>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -305,6 +310,14 @@ class Dense friend class Dense>; + friend class Dense>>; + + void convert_to(Dense>>* result) + const override; + + void move_to( + Dense>>* result) override; + void convert_to(Dense>* result) const override; void move_to(Dense>* result) override; diff --git a/include/ginkgo/core/matrix/diagonal.hpp b/include/ginkgo/core/matrix/diagonal.hpp index 50febffcfad..414b9b40d43 100644 --- a/include/ginkgo/core/matrix/diagonal.hpp +++ b/include/ginkgo/core/matrix/diagonal.hpp @@ -71,6 +71,7 @@ class Diagonal public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>>, + public ConvertibleTo>>>, public Transposable, public WritableToMatrixData, public WritableToMatrixData, @@ -101,7 +102,9 @@ class Diagonal using device_mat_data32 = device_matrix_data; using absolute_type = remove_complex; - friend class Diagonal>; + friend class Diagonal>; + + friend class Diagonal>>; std::unique_ptr transpose() const override; @@ -111,6 +114,10 @@ class Diagonal void move_to(Diagonal>* result) override; + void convert_to(Diagonal>>* result) const override; + + void move_to(Diagonal>>* result) override; + void convert_to(Csr* result) const override; void move_to(Csr* result) override; diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp index afa19f49407..12429084a1b 100644 --- a/include/ginkgo/core/matrix/ell.hpp +++ b/include/ginkgo/core/matrix/ell.hpp @@ -80,6 +80,7 @@ template class Ell : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, + public ConvertibleTo>, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -93,7 +94,8 @@ class Ell : public EnableLinOp>, friend class Coo; friend class Csr; friend class Ell, IndexType>; - friend class Ell, IndexType>; + friend class Ell, IndexType>; + friend class Ell>, IndexType>; friend class Hybrid; public: @@ -118,6 +120,11 @@ class Ell : public EnableLinOp>, void move_to(Ell, IndexType>* result) override; + void convert_to( + Ell>, IndexType>* result) const override; + + void move_to(Ell>, IndexType>* result) override; + void convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp index b8833d59b7f..d85dc6c1e0a 100644 --- a/include/ginkgo/core/matrix/fbcsr.hpp +++ b/include/ginkgo/core/matrix/fbcsr.hpp @@ -127,6 +127,7 @@ template class Fbcsr : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, + public ConvertibleTo>, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -175,13 +176,19 @@ class Fbcsr : public EnableLinOp>, using ConvertibleTo>::convert_to; using ConvertibleTo>::move_to; - friend class Fbcsr, IndexType>; + friend class Fbcsr, IndexType>; + friend class Fbcsr>, IndexType>; void convert_to( Fbcsr, IndexType>* result) const override; void move_to(Fbcsr, IndexType>* result) 
override; + void convert_to( + Fbcsr>, IndexType>* result) const override; + + void move_to(Fbcsr>, IndexType>* result) override; + void convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index db65b57b6fb..600a165d7fe 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -72,6 +72,7 @@ class Hybrid : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, + public ConvertibleTo>, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -386,13 +387,20 @@ class Hybrid imbalance_bounded_limit strategy_; }; - friend class Hybrid, IndexType>; + friend class Hybrid, IndexType>; + + friend class Hybrid>, IndexType>; void convert_to( Hybrid, IndexType>* result) const override; void move_to(Hybrid, IndexType>* result) override; + void convert_to( + Hybrid>, IndexType>* result) const override; + + void move_to(Hybrid>, IndexType>* result) override; + void convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp index 70656152e27..c26685eab9a 100644 --- a/include/ginkgo/core/matrix/sellp.hpp +++ b/include/ginkgo/core/matrix/sellp.hpp @@ -71,6 +71,7 @@ template class Sellp : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, + public ConvertibleTo>, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -102,13 +103,19 @@ class Sellp : public EnableLinOp>, using device_mat_data = device_matrix_data; using absolute_type = remove_complex; - friend class Sellp, IndexType>; + friend class Sellp, IndexType>; + friend class Sellp>, IndexType>; void convert_to( Sellp, IndexType>* result) const override; void move_to(Sellp, IndexType>* result) override; + void convert_to( + Sellp>, IndexType>* result) const override; + + void move_to(Sellp>, IndexType>* result) override; + void convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 47259feeac0..2edf676bda1 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -75,6 +75,7 @@ target_include_directories(ginkgo_omp PRIVATE "${OpenMP_CXX_INCLUDE_DIRS}") separate_arguments(OpenMP_SEP_FLAGS NATIVE_COMMAND "${OpenMP_CXX_FLAGS}") target_compile_options(ginkgo_omp PRIVATE "${OpenMP_SEP_FLAGS}") target_compile_options(ginkgo_omp PRIVATE "${GINKGO_COMPILER_FLAGS}") +target_compile_definitions(ginkgo_omp PRIVATE GINKGO_COMPILE_KERNEL=1) # Need to link against ginkgo_cuda for the `raw_copy_to(CudaExecutor ...)` method target_link_libraries(ginkgo_omp PRIVATE ginkgo_cuda) diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp index 4f8c3fa4e6a..3832d0d85ec 100644 --- a/omp/components/atomic.hpp +++ b/omp/components/atomic.hpp @@ -61,10 +61,8 @@ void atomic_add(ValueType& out, ValueType val) // The C++ standard explicitly allows casting complex* to double* // [complex.numbers.general] auto values = reinterpret_cast*>(&out); -#pragma omp atomic - values[0] += real(val); -#pragma omp atomic - values[1] += imag(val); + atomic_add(values[0], real(val)); + atomic_add(values[1], imag(val)); } @@ -89,13 +87,12 @@ void atomic_add(half& out, half val) assumed = old; auto answer = reinterpret(reinterpret(assumed) + val); #pragma omp atomic capture -{ - old = *address_as_converter; - 
*address_as_converter = (old == assumed) ? answer : old; -} + { + old = *address_as_converter; + *address_as_converter = (old == assumed) ? answer : old; + } } while (assumed != old); - -} // namespace omp +} } // namespace omp diff --git a/omp/matrix/fft_kernels.cpp b/omp/matrix/fft_kernels.cpp index 2e9f30f3860..1ec950282b2 100644 --- a/omp/matrix/fft_kernels.cpp +++ b/omp/matrix/fft_kernels.cpp @@ -149,7 +149,7 @@ void fft(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT_KERNEL); template @@ -220,7 +220,7 @@ void fft2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT2_KERNEL); template @@ -325,7 +325,7 @@ void fft3(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/omp/solver/cb_gmres_kernels.cpp b/omp/solver/cb_gmres_kernels.cpp index e8fa36556ba..1e60e45d734 100644 --- a/omp/solver/cb_gmres_kernels.cpp +++ b/omp/solver/cb_gmres_kernels.cpp @@ -361,7 +361,7 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/omp/solver/idr_kernels.cpp b/omp/solver/idr_kernels.cpp index 8f4acf8a747..6ae31a1dc27 100644 --- a/omp/solver/idr_kernels.cpp +++ b/omp/solver/idr_kernels.cpp @@ -167,15 +167,16 @@ void initialize(std::shared_ptr exec, const size_type nrhs, // Initialize and Orthonormalize P const auto num_rows = subspace_vectors->get_size()[0]; const auto num_cols = subspace_vectors->get_size()[1]; - auto dist = std::normal_distribution>(0.0, 1.0); + // auto dist = + // std::normal_distribution>(0.0, 1.0); auto seed = std::random_device{}(); auto gen = std::default_random_engine(seed); for (size_type row = 0; row < num_rows; row++) { if (!deterministic) { - for (size_type col = 0; col < num_cols; col++) { - subspace_vectors->at(row, col) = - get_rand_value(dist, gen); - } + // for (size_type col = 0; col < num_cols; col++) { + // subspace_vectors->at(row, col) = + // get_rand_value(dist, gen); + // } } for (size_type i = 0; i < row; i++) { diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index dd54e3fb52f..b857904415e 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -71,6 +71,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC") set_source_files_properties(preconditioner/jacobi_kernels.cpp PROPERTIES COMPILE_FLAGS "-O1") endif() +target_compile_definitions(ginkgo_reference PRIVATE GINKGO_COMPILE_KERNEL=1) if (GINKGO_CHECK_CIRCULAR_DEPS) ginkgo_check_headers(ginkgo_reference "") endif() diff --git a/reference/matrix/fft_kernels.cpp b/reference/matrix/fft_kernels.cpp index bdf056cf882..a81a4499c64 100644 --- a/reference/matrix/fft_kernels.cpp +++ b/reference/matrix/fft_kernels.cpp @@ -146,7 +146,7 @@ void fft(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT_KERNEL); template @@ -213,7 +213,7 @@ void fft2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL); 
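The omp atomic_add(half&, half) above, like the 16-bit device helpers later in this series, relies on the usual compare-and-swap idiom: load the word holding the value, compute the update, and only store it if nobody changed the word in the meantime. A host-only sketch of that loop with std::atomic, operating on a plain 16-bit integer packed into a 32-bit word; this illustrates the technique and is not the library's code:

#include <atomic>
#include <cstdint>
#include <iostream>

// atomically add `val` to the 16-bit integer stored in the low half of `word`
std::uint16_t atomic_add_16(std::atomic<std::uint32_t>& word,
                            std::uint16_t val)
{
    auto old_word = word.load();
    std::uint32_t new_word;
    do {
        const auto old_low = static_cast<std::uint16_t>(old_word & 0xffffu);
        const auto new_low = static_cast<std::uint16_t>(old_low + val);
        new_word = (old_word & 0xffff0000u) | new_low;
        // compare_exchange_weak reloads old_word whenever another thread won
    } while (!word.compare_exchange_weak(old_word, new_word));
    return static_cast<std::uint16_t>(old_word & 0xffffu);  // previous value
}

int main()
{
    std::atomic<std::uint32_t> storage{5u};
    atomic_add_16(storage, 7u);
    std::cout << (storage.load() & 0xffffu) << "\n";  // prints 12
}
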
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT2_KERNEL); template @@ -313,7 +313,7 @@ void fft3(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/reference/solver/cb_gmres_kernels.cpp b/reference/solver/cb_gmres_kernels.cpp index 2df07cf9258..b5dde273796 100644 --- a/reference/solver/cb_gmres_kernels.cpp +++ b/reference/solver/cb_gmres_kernels.cpp @@ -325,7 +325,7 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/reference/solver/idr_kernels.cpp b/reference/solver/idr_kernels.cpp index 15a95ae0711..79ca67866bb 100644 --- a/reference/solver/idr_kernels.cpp +++ b/reference/solver/idr_kernels.cpp @@ -152,15 +152,15 @@ void initialize(std::shared_ptr exec, // Initialize and Orthonormalize P const auto num_rows = subspace_vectors->get_size()[0]; const auto num_cols = subspace_vectors->get_size()[1]; - auto dist = std::normal_distribution>(0.0, 1.0); + // auto dist = std::normal_distribution>(0.0, 1.0); auto seed = std::random_device{}(); auto gen = std::default_random_engine(seed); for (size_type row = 0; row < num_rows; row++) { if (!deterministic) { - for (size_type col = 0; col < num_cols; col++) { - subspace_vectors->at(row, col) = - get_rand_value(dist, gen); - } + // for (size_type col = 0; col < num_cols; col++) { + // subspace_vectors->at(row, col) = + // // get_rand_value(dist, gen); + // } } for (size_type i = 0; i < row; i++) { From c517644fda427cced9aadbcd427888bed627498b Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 11 Jan 2023 16:35:58 -0600 Subject: [PATCH 04/48] compile for cuda/sycl/test/mpi (hip needs trick) --- accessor/hip_helper.hpp | 9 +- common/cuda_hip/components/atomic.hpp.inc | 50 ++- common/cuda_hip/components/warp_blas.hpp.inc | 2 +- .../distributed/matrix_kernels.hpp.inc | 23 +- .../par_ilut_select_kernels.hpp.inc | 2 +- common/unified/multigrid/pgm_kernels.cpp | 10 +- core/base/extended_float.hpp | 318 +++++++----------- core/distributed/matrix.cpp | 39 +++ core/distributed/vector.cpp | 19 ++ core/test/utils.hpp | 9 +- cuda/distributed/matrix_kernels.cu | 1 + cuda/matrix/csr_kernels.template.cu | 5 +- dpcpp/CMakeLists.txt | 1 + dpcpp/components/atomic.dp.hpp | 30 +- dpcpp/components/cooperative_groups.dp.hpp | 6 + .../par_ilut_select_kernels.hpp.inc | 2 +- dpcpp/matrix/csr_kernels.dp.cpp | 8 + dpcpp/matrix/dense_kernels.dp.cpp | 67 ++-- dpcpp/solver/idr_kernels.dp.cpp | 8 +- hip/CMakeLists.txt | 1 + hip/base/types.hip.hpp | 72 +++- hip/components/cooperative_groups.hip.hpp | 10 +- hip/matrix/fft_kernels.hip.cpp | 8 +- .../jacobi_generate_instantiate.inc.hip.cpp | 12 + hip/solver/idr_kernels.hip.cpp | 8 +- include/ginkgo/core/base/math.hpp | 59 ++-- include/ginkgo/core/base/mpi.hpp | 5 + .../ginkgo/core/base/precision_dispatch.hpp | 16 +- include/ginkgo/core/distributed/matrix.hpp | 13 +- include/ginkgo/core/distributed/vector.hpp | 19 +- test/matrix/matrix.cpp | 10 +- test/mpi/matrix.cpp | 12 +- test/mpi/solver/solver.cpp | 10 +- test/solver/solver.cpp | 10 +- 34 files changed, 541 insertions(+), 333 deletions(-) diff --git a/accessor/hip_helper.hpp b/accessor/hip_helper.hpp index 9848b4360f8..5feaa45400b 100644 --- a/accessor/hip_helper.hpp +++ b/accessor/hip_helper.hpp @@ -47,6 +47,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "utils.hpp" +struct __half; + + namespace gko { namespace acc { namespace detail { @@ -83,11 +86,15 @@ struct hip_type { using type = typename hip_type::type&&; }; +template <> +struct hip_type { + using type = __half; +}; // Transform std::complex to thrust::complex template struct hip_type> { - using type = thrust::complex; + using type = thrust::complex::type>; }; diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc index 24bce48a720..5394649335e 100644 --- a/common/cuda_hip/components/atomic.hpp.inc +++ b/common/cuda_hip/components/atomic.hpp.inc @@ -110,15 +110,63 @@ __forceinline__ __device__ ResultType reinterpret(ValueType val) } \ }; + +#define GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(CONVERTER_TYPE) \ + template \ + struct atomic_helper< \ + ValueType, \ + std::enable_if_t<(sizeof(ValueType) == sizeof(CONVERTER_TYPE))>> { \ + __forceinline__ __device__ static ValueType atomic_add( \ + ValueType* __restrict__ addr, ValueType val) \ + { \ + using c_type = CONVERTER_TYPE; \ + return atomic_wrapper( \ + addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \ + old = *c_addr; \ + *c_addr = reinterpret( \ + val + reinterpret(assumed)); \ + }); \ + } \ + __forceinline__ __device__ static ValueType atomic_max( \ + ValueType* __restrict__ addr, ValueType val) \ + { \ + using c_type = CONVERTER_TYPE; \ + return atomic_wrapper( \ + addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \ + if (reinterpret(assumed) < val) { \ + old = *c_addr; \ + *c_addr = reinterpret(assumed); \ + } \ + }); \ + } \ + \ + private: \ + template \ + __forceinline__ __device__ static ValueType atomic_wrapper( \ + ValueType* __restrict__ addr, Callable set_old) \ + { \ + CONVERTER_TYPE* address_as_converter = \ + reinterpret_cast(addr); \ + CONVERTER_TYPE old = *address_as_converter; \ + CONVERTER_TYPE assumed = old; \ + set_old(old, assumed, address_as_converter); \ + return reinterpret(old); \ + } \ + }; + // Support 64-bit ATOMIC_ADD and ATOMIC_MAX GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int); // Support 32-bit ATOMIC_ADD and ATOMIC_MAX GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); -#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) +#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) && \ + !(defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC) // CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS +// required the CC>70 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int); +#else +GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(unsigned short int) #endif // !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) #undef GKO_BIND_ATOMIC_HELPER_STRUCTURE diff --git a/common/cuda_hip/components/warp_blas.hpp.inc b/common/cuda_hip/components/warp_blas.hpp.inc index 8869dae3e3b..195f65ffd68 100644 --- a/common/cuda_hip/components/warp_blas.hpp.inc +++ b/common/cuda_hip/components/warp_blas.hpp.inc @@ -435,5 +435,5 @@ __device__ __forceinline__ remove_complex compute_infinity_norm( } } return reduce(group, sum, - [](result_type x, result_type y) { return max(x, y); }); + [](result_type x, result_type y) { return gko::max(x, y); }); } diff --git a/common/cuda_hip/distributed/matrix_kernels.hpp.inc b/common/cuda_hip/distributed/matrix_kernels.hpp.inc index 0fa7afab859..7457af72267 100644 --- a/common/cuda_hip/distributed/matrix_kernels.hpp.inc +++ b/common/cuda_hip/distributed/matrix_kernels.hpp.inc @@ -138,11 +138,11 @@ void build_local_nonlocal( col_range_starting_indices[range_id]; }; - using input_type = input_type; + using input_type = 
input_type, GlobalIndexType>; auto input_it = thrust::make_zip_iterator(thrust::make_tuple( input.get_const_row_idxs(), input.get_const_col_idxs(), - input.get_const_values(), row_range_ids.get_const_data(), - col_range_ids.get_const_data())); + as_device_type(input.get_const_values()), + row_range_ids.get_const_data(), col_range_ids.get_const_data())); // copy and transform local entries into arrays local_row_idxs.resize_and_reset(num_local_elements); @@ -150,17 +150,17 @@ void build_local_nonlocal( local_values.resize_and_reset(num_local_elements); auto local_it = thrust::make_transform_iterator( input_it, [map_to_local_row, map_to_local_col] __host__ __device__( - const input_type input) { - auto local_row = map_to_local_row(input.row, input.row_range); - auto local_col = map_to_local_col(input.col, input.col_range); - return thrust::make_tuple(local_row, local_col, input.val); + const input_type input2) { + auto local_row = map_to_local_row(input2.row, input2.row_range); + auto local_col = map_to_local_col(input2.col, input2.col_range); + return thrust::make_tuple(local_row, local_col, input2.val); }); thrust::copy_if( thrust_policy(exec), local_it, local_it + input.get_num_elems(), range_ids_it, - thrust::make_zip_iterator(thrust::make_tuple(local_row_idxs.get_data(), - local_col_idxs.get_data(), - local_values.get_data())), + thrust::make_zip_iterator(thrust::make_tuple( + local_row_idxs.get_data(), local_col_idxs.get_data(), + as_device_type(local_values.get_data()))), [local_part, row_part_ids, col_part_ids] __host__ __device__( const thrust::tuple& tuple) { auto row_part = row_part_ids[thrust::get<0>(tuple)]; @@ -189,7 +189,8 @@ void build_local_nonlocal( range_ids_it, thrust::make_zip_iterator(thrust::make_tuple( non_local_row_idxs.get_data(), non_local_global_col_idxs.get_data(), - non_local_values.get_data(), non_local_col_part_ids.get_data(), + as_device_type(non_local_values.get_data()), + non_local_col_part_ids.get_data(), non_local_col_range_ids.get_data())), [local_part, row_part_ids, col_part_ids] __host__ __device__( const thrust::tuple& tuple) { diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc index 2f73d731a69..8adc9329826 100644 --- a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc @@ -269,7 +269,7 @@ __global__ __launch_bounds__(basecase_block_size) void basecase_select( __shared__ ValueType sh_local[basecase_size]; for (int i = 0; i < basecase_local_size; ++i) { auto idx = threadIdx.x + i * basecase_block_size; - local[i] = idx < size ? input[idx] : sentinel; + local[i] = idx < size ? 
input[idx] : static_cast(sentinel); } bitonic_sort(local, sh_local); if (threadIdx.x == rank / basecase_local_size) { diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp index eaf6d20cb3c..82b8853e4b3 100644 --- a/common/unified/multigrid/pgm_kernels.cpp +++ b/common/unified/multigrid/pgm_kernels.cpp @@ -214,7 +214,7 @@ void find_strongest_neighbor( continue; } auto weight = - weight_vals[idx] / max(abs(diag[row]), abs(diag[col])); + weight_vals[idx] / gko::max(abs(diag[row]), abs(diag[col])); if (agg[col] == -1 && device_std::tie(weight, col) > device_std::tie(max_weight_unagg, strongest_unagg)) { @@ -278,8 +278,8 @@ void assign_to_exist_agg(std::shared_ptr exec, if (col == row) { continue; } - auto weight = - weight_vals[idx] / max(abs(diag[row]), abs(diag[col])); + auto weight = weight_vals[idx] / + gko::max(abs(diag[row]), abs(diag[col])); if (agg_const_val[col] != -1 && device_std::tie(weight, col) > device_std::tie(max_weight_agg, strongest_agg)) { @@ -317,8 +317,8 @@ void assign_to_exist_agg(std::shared_ptr exec, if (col == row) { continue; } - auto weight = - weight_vals[idx] / max(abs(diag[row]), abs(diag[col])); + auto weight = weight_vals[idx] / + gko::max(abs(diag[row]), abs(diag[col])); if (agg_val[col] != -1 && device_std::tie(weight, col) > device_std::tie(max_weight_agg, strongest_agg)) { diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index f7b25954cff..82114026d40 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -323,25 +323,20 @@ class half { public: GKO_ATTRIBUTES half() noexcept = default; - GKO_ATTRIBUTES half& operator=(const half& val) = default; - GKO_ATTRIBUTES half(const half& val) = default; - // GKO_ATTRIBUTES half(half const&) = default; - // complex() = default; - - // complex(const complex& z) = default; - - explicit GKO_ATTRIBUTES half(float32 val) noexcept + template ::value>> + GKO_ATTRIBUTES half(const T val) { - this->float2half(val); + this->float2half(static_cast(val)); } - explicit GKO_ATTRIBUTES half(float64 val) noexcept - : half(static_cast(val)) - {} + GKO_ATTRIBUTES half(const half& val) = default; - explicit GKO_ATTRIBUTES half(int val) noexcept - : half(static_cast(val)) - {} + template + GKO_ATTRIBUTES half& operator=(const V val) + { + this->float2half(static_cast(val)); + return *this; + } GKO_ATTRIBUTES operator float() const noexcept { @@ -353,151 +348,65 @@ class half { #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } - // #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - // GKO_ATTRIBUTES operator __half() noexcept - // { - // return reinterpret_cast(*this); - // } - // #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - - - GKO_ATTRIBUTES half& operator+=(const float& rhs) - { - auto val = *this + rhs; - this->float2half(val); - return *this; - } - - GKO_ATTRIBUTES half& operator/=(const float& rhs) - { - auto val = *this / rhs; - this->float2half(val); - return *this; - } - - GKO_ATTRIBUTES half& operator*=(const float& rhs) - { - auto val = *this * rhs; - this->float2half(val); - return *this; - } - - GKO_ATTRIBUTES half& operator-=(const float& rhs) - { - auto val = *this - rhs; - this->float2half(val); - return *this; - } - - // half& operator+=(const half& rhs) - // { - // auto val = *this + float(rhs); - // this->float2half(val); - // return *this; - // } - - // half& operator/=(const half& rhs) - // { - // auto val = *this / float(rhs); - // this->float2half(val); - // return 
*this; - // } - - // half& operator*=(const half& rhs) - // { - // auto val = *this * float(rhs); - // this->float2half(val); - // return *this; - // } - - // half& operator-=(const half& rhs) - // { - // auto val = *this - float(rhs); - // this->float2half(val); - // return *this; - // } - - GKO_ATTRIBUTES friend half operator+(half lhs, const half& rhs) - { - float flhs = lhs; - flhs += rhs; // reuse compound assignment - return half(flhs); - } - - GKO_ATTRIBUTES friend half operator-(half lhs, const half& rhs) - { - float flhs = lhs; - flhs -= rhs; // reuse compound assignment - return half(flhs); - } - - GKO_ATTRIBUTES friend half operator*(half lhs, const half& rhs) - { - float flhs = lhs; - flhs *= rhs; // reuse compound assignment - return half(flhs); - } - - GKO_ATTRIBUTES friend half operator/(half lhs, const half& rhs) - { - float flhs = lhs; - flhs /= rhs; // reuse compound assignment - return half(flhs); - } - - - // GKO_ATTRIBUTES friend half operator+(half lhs, const float& rhs) - // { - // float flhs = lhs; - // flhs += rhs; // reuse compound assignment - // return half(flhs); - // } - - // GKO_ATTRIBUTES friend half operator-(half lhs, const float& rhs) - // { - // float flhs = lhs; - // flhs -= rhs; // reuse compound assignment - // return half(flhs); - // } - - // GKO_ATTRIBUTES friend half operator*(half lhs, const float& rhs) - // { - // float flhs = lhs; - // flhs *= rhs; // reuse compound assignment - // return half(flhs); - // } - - // GKO_ATTRIBUTES friend half operator/(half lhs, const float& rhs) - // { - // float flhs = lhs; - // flhs /= rhs; // reuse compound assignment - // return half(flhs); - // } - - GKO_ATTRIBUTES half& operator=(long long int val) - { - this->float2half(float(val)); - return *this; - } - - GKO_ATTRIBUTES half& operator=(int val) - { - this->float2half(float(val)); - return *this; - } - - GKO_ATTRIBUTES half& operator=(float val) - { - this->float2half(val); - return *this; - } - - GKO_ATTRIBUTES half& operator=(double val) - { - this->float2half(static_cast(val)); - return *this; - } - + // can not use half operator _op(const half) for half + half + // operation will cast it to float and then do float operation such that it + // becomes float in the end. +#define HALF_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES friend half operator _op(const half lhf, const half rhf) \ + { \ + return static_cast(static_cast(lhf) \ + _op static_cast(rhf)); \ + } \ + GKO_ATTRIBUTES half& operator _opeq(const half& hf) \ + { \ + auto result = *this _op hf; \ + this->float2half(result); \ + return *this; \ + } + HALF_OPERATOR(+, +=) + HALF_OPERATOR(-, -=) + HALF_OPERATOR(*, *=) + HALF_OPERATOR(/, /=) + + // Do operation with different type + // If it is floating point, using floating point as type. 
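As the comment above notes, half arithmetic cannot be left to the implicit float conversion alone, or every result would silently stay a float; the HALF_OPERATOR macro therefore widens both operands, computes in float, and narrows the result back into the 16-bit type. A toy version of that pattern follows; toy_half here is a stand-in that merely truncates a float, not gko::half:

#include <iostream>

class toy_half {
public:
    toy_half() = default;
    toy_half(float val) : data_(squash(val)) {}
    operator float() const { return data_; }

// every binary operator widens both sides to float, computes there, and
// narrows the result back into the storage type
#define TOY_HALF_OPERATOR(_op, _opeq)                                         \
    friend toy_half operator _op(const toy_half lhs, const toy_half rhs)      \
    {                                                                         \
        return toy_half(static_cast<float>(lhs) _op static_cast<float>(rhs)); \
    }                                                                         \
    toy_half& operator _opeq(const toy_half& rhs)                             \
    {                                                                         \
        *this = *this _op rhs;                                                \
        return *this;                                                         \
    }

    TOY_HALF_OPERATOR(+, +=)
    TOY_HALF_OPERATOR(-, -=)
    TOY_HALF_OPERATOR(*, *=)
    TOY_HALF_OPERATOR(/, /=)
#undef TOY_HALF_OPERATOR

private:
    // crude precision squashing so the type visibly differs from float
    static float squash(float val)
    {
        return static_cast<float>(static_cast<int>(val * 1024.0f)) / 1024.0f;
    }
    float data_{};
};

int main()
{
    toy_half a{1.5f};
    toy_half b{2.25f};
    a += b;                          // widen to float, add, narrow back
    std::cout << float(a) << "\n";   // 3.75
}
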
+ // If it is integer, using half as type +#define HALF_FRIEND_OPERATOR(_op, _opeq) \ + template \ + GKO_ATTRIBUTES friend std::enable_if_t< \ + !std::is_same::value && std::is_scalar::value, \ + typename std::conditional::value, T, \ + half>::type> \ + operator _op(const half hf, const T val) \ + { \ + using type = \ + typename std::conditional::value, T, \ + half>::type; \ + auto result = static_cast(hf); \ + result _opeq static_cast(val); \ + return result; \ + } \ + template \ + GKO_ATTRIBUTES friend std::enable_if_t< \ + !std::is_same::value && std::is_scalar::value, \ + typename std::conditional::value, T, \ + half>::type> \ + operator _op(const T val, const half hf) \ + { \ + using type = \ + typename std::conditional::value, T, \ + half>::type; \ + auto result = static_cast(hf); \ + result _opeq static_cast(val); \ + return result; \ + } + + HALF_FRIEND_OPERATOR(+, +=) + HALF_FRIEND_OPERATOR(-, -=) + HALF_FRIEND_OPERATOR(*, *=) + HALF_FRIEND_OPERATOR(/, /=) + + // the negative GKO_ATTRIBUTES half operator-() const { auto val = 0.0f - *this; @@ -508,6 +417,8 @@ class half { using f16_traits = detail::float_traits; using f32_traits = detail::float_traits; + // TODO: do we really need this one? + // Without it, everything can be constexpr, which might make stuff easier. GKO_ATTRIBUTES void float2half(float val) noexcept { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) @@ -518,7 +429,7 @@ class half { #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } - static uint16 float2half(uint32 data_) noexcept + static GKO_ATTRIBUTES uint16 float2half(uint32 data_) noexcept { using conv = detail::precision_converter; if (f32_traits::is_inf(data_)) { @@ -540,7 +451,7 @@ class half { } } - static uint32 half2float(uint16 data_) noexcept + static GKO_ATTRIBUTES uint32 half2float(uint16 data_) noexcept { using conv = detail::precision_converter; if (f16_traits::is_inf(data_)) { @@ -669,7 +580,7 @@ class complex { {} template - explicit complex(const T& real) : complex(static_cast(real)) + complex(const T& real) : complex(static_cast(real)) {} template @@ -689,73 +600,69 @@ class complex { static_cast(imag_)); } - complex& operator=(const int& __re) + template + complex& operator=(const V& val) { - real_ = __re; + real_ = val; imag_ = value_type(); return *this; } - complex& operator=(const value_type& __re) + template + complex& operator=(const std::complex& val) { - real_ = __re; - imag_ = value_type(); + real_ = val.real(); + imag_ = val.imag(); return *this; } - complex& operator+=(const value_type& __re) + + complex& operator+=(const value_type& real) { - real_ += __re; + real_ += real; return *this; } - complex& operator-=(const value_type& __re) + complex& operator-=(const value_type& real) { - real_ -= __re; + real_ -= real; return *this; } - complex& operator*=(const value_type& __re) + complex& operator*=(const value_type& real) { - real_ *= __re; - imag_ *= __re; + real_ *= real; + imag_ *= real; return *this; } - complex& operator/=(const value_type& __re) + complex& operator/=(const value_type& real) { - real_ /= __re; - imag_ /= __re; + real_ /= real; + imag_ /= real; return *this; } - template - complex& operator=(const complex<_Xp>& __c) - { - real_ = __c.real(); - imag_ = __c.imag(); - return *this; - } - template - complex& operator+=(const complex<_Xp>& __c) + template + complex& operator+=(const complex& val) { - real_ += __c.real(); - imag_ += __c.imag(); + real_ += val.real(); + imag_ += val.imag(); return *this; } - template - complex& 
operator-=(const complex<_Xp>& __c) + template + complex& operator-=(const complex& val) { - real_ -= __c.real(); - imag_ -= __c.imag(); + real_ -= val.real(); + imag_ -= val.imag(); return *this; } - template - complex& operator*=(const complex<_Xp>& __c) + template + complex& operator*=(const complex& val) { - *this = *this * complex(__c.real(), __c.imag()); + *this = *this * complex(val.real(), val.imag()); return *this; } - template - complex& operator/=(const complex<_Xp>& __c) + template + complex& operator/=(const complex& val) { - *this = *this / complex(__c.real(), __c.imag()); + *this = *this / complex(val.real(), val.imag()); return *this; } @@ -796,10 +703,6 @@ class complex> { }; -template <> -struct is_scalar : std::true_type {}; - - template <> struct numeric_limits { static constexpr bool is_specialized{true}; @@ -835,6 +738,15 @@ struct numeric_limits { } }; +template <> +inline complex& complex::operator=( + const std::complex& a) +{ + complex t(a.real(), a.imag()); + operator=(t); + return *this; +} + } // namespace std diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index f8de8dbaef0..8c16bbaa2cd 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -139,6 +139,45 @@ void Matrix::move_to( } +template +void Matrix::convert_to( + Matrix>, local_index_type, + global_index_type>* result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->copy_from(this->local_mtx_.get()); + result->non_local_mtx_->copy_from(this->non_local_mtx_.get()); + result->gather_idxs_ = this->gather_idxs_; + result->send_offsets_ = this->send_offsets_; + result->recv_offsets_ = this->recv_offsets_; + result->recv_sizes_ = this->recv_sizes_; + result->send_sizes_ = this->send_sizes_; + result->non_local_to_global_ = this->non_local_to_global_; + result->set_size(this->get_size()); +} + + +template +void Matrix::move_to( + Matrix>, local_index_type, + global_index_type>* result) +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->move_from(this->local_mtx_.get()); + result->non_local_mtx_->move_from(this->non_local_mtx_.get()); + result->gather_idxs_ = std::move(this->gather_idxs_); + result->send_offsets_ = std::move(this->send_offsets_); + result->recv_offsets_ = std::move(this->recv_offsets_); + result->recv_sizes_ = std::move(this->recv_sizes_); + result->send_sizes_ = std::move(this->send_sizes_); + result->non_local_to_global_ = std::move(this->non_local_to_global_); + result->set_size(this->get_size()); + this->set_size({}); +} + + template void Matrix::read_distributed( const device_matrix_data& data, diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 001cf75b76d..f8f4376e217 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -290,6 +290,25 @@ void Vector::move_to(Vector>* result) } +template +void Vector::convert_to( + Vector>>* result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->set_size(this->get_size()); + this->get_local_vector()->convert_to(&result->local_); +} + + +template +void Vector::move_to( + Vector>>* result) +{ + this->convert_to(result); +} + + template std::unique_ptr::absolute_type> Vector::compute_absolute() const diff --git a/core/test/utils.hpp b/core/test/utils.hpp index a16db1eb93a..f2c3b33a02f 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -214,15 +214,14 @@ template struct 
reduction_factor { using nc_output = remove_complex; using nc_precision = remove_complex; - static constexpr nc_output value{ - std::numeric_limits::epsilon() * nc_output{10} * - (gko::is_complex() ? nc_output{1.4142} : one())}; + static nc_output value; }; template -constexpr remove_complex - reduction_factor::value; +remove_complex reduction_factor::value = + std::numeric_limits::epsilon() * nc_output{10} * + (gko::is_complex() ? nc_output{1.4142} : one()); } // namespace test diff --git a/cuda/distributed/matrix_kernels.cu b/cuda/distributed/matrix_kernels.cu index b1f5558d69e..f14bee8bbd3 100644 --- a/cuda/distributed/matrix_kernels.cu +++ b/cuda/distributed/matrix_kernels.cu @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/thrust.cuh" +#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu index 1b4b20a1e75..657b2a3a1ca 100644 --- a/cuda/matrix/csr_kernels.template.cu +++ b/cuda/matrix/csr_kernels.template.cu @@ -258,8 +258,9 @@ void classical_spmv(syn::value_list, exec->get_num_multiprocessor() * classical_oversubscription; const auto gridx = - std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), - int64(nwarps / warps_in_block)); + ceildiv(a->get_size()[0], spmv_block_size / subwarp_size); + // std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), + // int64(nwarps / warps_in_block)); const dim3 grid(gridx, b->get_size()[1]); const auto block = spmv_block_size; diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index dd0d7c4cdfb..35b71f7c54d 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -110,6 +110,7 @@ target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL) if (GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) endif() +target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_COMPILE_KERNEL=1) ginkgo_default_includes(ginkgo_dpcpp) ginkgo_install_library(ginkgo_dpcpp) diff --git a/dpcpp/components/atomic.dp.hpp b/dpcpp/components/atomic.dp.hpp index d936f78aa94..44b55e3e6bb 100644 --- a/dpcpp/components/atomic.dp.hpp +++ b/dpcpp/components/atomic.dp.hpp @@ -175,6 +175,21 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int); // Support 32-bit ATOMIC_ADD GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); +// sycl does not support 16bit +template +struct atomic_helper> { + __dpct_inline__ static ValueType atomic_add(ValueType* __restrict__ addr, + ValueType val) + { + // GKO_NOT_IMPLEMENTED; + // wrong implementation because sycl can not use exception in kernel + auto old = *addr; + *addr += val; + return old; + } +}; + #undef GKO_BIND_ATOMIC_HELPER_STRUCTURE @@ -242,7 +257,20 @@ struct atomic_helper< GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned long long int); // Support 32-bit ATOMIC_ADD GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned int); - +// not support 16bit +template +struct atomic_max_helper> { + __dpct_inline__ static ValueType atomic_max(ValueType* __restrict__ addr, + ValueType val) + { + // GKO_NOT_IMPLEMENTED; + // wrong implementation because sycl can not use exception in kernel + auto old = *addr; + *addr = std::max(*addr, val); + return old; + } +}; #undef GKO_BIND_ATOMIC_MAX_STRUCTURE diff --git a/dpcpp/components/cooperative_groups.dp.hpp b/dpcpp/components/cooperative_groups.dp.hpp index e2212285954..908b062e692 100644 --- a/dpcpp/components/cooperative_groups.dp.hpp +++ 
b/dpcpp/components/cooperative_groups.dp.hpp @@ -43,6 +43,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/base/config.hpp" #include "dpcpp/base/dpct.hpp" +// namespace sycl { +// namespace detail { +// template <> +// struct is_arithmetic : public std::false_type {}; +// } // namespace detail +// } // namespace sycl namespace gko { namespace kernels { diff --git a/dpcpp/factorization/par_ilut_select_kernels.hpp.inc b/dpcpp/factorization/par_ilut_select_kernels.hpp.inc index 41fa99cc24e..10da0115223 100644 --- a/dpcpp/factorization/par_ilut_select_kernels.hpp.inc +++ b/dpcpp/factorization/par_ilut_select_kernels.hpp.inc @@ -372,7 +372,7 @@ void basecase_select(const ValueType* __restrict__ input, IndexType size, for (int i = 0; i < basecase_local_size; ++i) { auto idx = item_ct1.get_local_id(2) + i * basecase_block_size; - local[i] = idx < size ? input[idx] : sentinel; + local[i] = idx < size ? input[idx] : static_cast(sentinel); } bitonic_sort(local, sh_local, item_ct1); if (item_ct1.get_local_id(2) == rank / basecase_local_size) { diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 11309b67b9b..bd84606e7fd 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -1236,6 +1236,14 @@ void load_balance_spmv(std::shared_ptr exec, } } +template +struct onemkl_support : std::false_type {}; + +template <> +struct onemkl_support : std::true_type {}; + +template <> +struct onemkl_support : std::true_type {}; template bool try_general_sparselib_spmv(std::shared_ptr exec, diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index fba518f387b..86a45e12efa 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -242,6 +242,20 @@ void compute_norm2_dispatch(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); +template +struct onemkl_support : std::false_type {}; + +template <> +struct onemkl_support : std::true_type {}; + +template <> +struct onemkl_support : std::true_type {}; + +template <> +struct onemkl_support> : std::true_type {}; + +template <> +struct onemkl_support> : std::true_type {}; template void simple_apply(std::shared_ptr exec, @@ -250,17 +264,21 @@ void simple_apply(std::shared_ptr exec, matrix::Dense* c) { using namespace oneapi::mkl; - if (b->get_stride() != 0 && c->get_stride() != 0) { - if (a->get_size()[1] > 0) { - oneapi::mkl::blas::row_major::gemm( - *exec->get_queue(), transpose::nontrans, transpose::nontrans, - c->get_size()[0], c->get_size()[1], a->get_size()[1], - one(), a->get_const_values(), a->get_stride(), - b->get_const_values(), b->get_stride(), zero(), - c->get_values(), c->get_stride()); - } else { - dense::fill(exec, c, zero()); + if constexpr (onemkl_support::value) { + if (b->get_stride() != 0 && c->get_stride() != 0) { + if (a->get_size()[1] > 0) { + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, + transpose::nontrans, c->get_size()[0], c->get_size()[1], + a->get_size()[1], one(), a->get_const_values(), + a->get_stride(), b->get_const_values(), b->get_stride(), + zero(), c->get_values(), c->get_stride()); + } else { + dense::fill(exec, c, zero()); + } } + } else { + GKO_NOT_IMPLEMENTED; } } @@ -274,19 +292,24 @@ void apply(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { using namespace oneapi::mkl; - if (b->get_stride() != 0 && c->get_stride() != 0) { - if (a->get_size()[1] > 0) { - 
oneapi::mkl::blas::row_major::gemm( - *exec->get_queue(), transpose::nontrans, transpose::nontrans, - c->get_size()[0], c->get_size()[1], a->get_size()[1], - exec->copy_val_to_host(alpha->get_const_values()), - a->get_const_values(), a->get_stride(), b->get_const_values(), - b->get_stride(), - exec->copy_val_to_host(beta->get_const_values()), - c->get_values(), c->get_stride()); - } else { - dense::scale(exec, beta, c); + if constexpr (onemkl_support::value) { + if (b->get_stride() != 0 && c->get_stride() != 0) { + if (a->get_size()[1] > 0) { + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, + transpose::nontrans, c->get_size()[0], c->get_size()[1], + a->get_size()[1], + exec->copy_val_to_host(alpha->get_const_values()), + a->get_const_values(), a->get_stride(), + b->get_const_values(), b->get_stride(), + exec->copy_val_to_host(beta->get_const_values()), + c->get_values(), c->get_stride()); + } else { + dense::scale(exec, beta, c); + } } + } else { + GKO_NOT_IMPLEMENTED; } } diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index 3e7b5737f0f..452ffef9c5b 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -636,11 +636,11 @@ void initialize_subspace_vectors(std::shared_ptr exec, cgh.parallel_for(sycl::range<1>(n), [=](sycl::item<1> idx) { std::uint64_t offset = idx.get_linear_id(); oneapi::dpl::minstd_rand engine(seed, offset); - oneapi::dpl::normal_distribution> - distr(0, 1); - auto res = distr(engine); + // oneapi::dpl::normal_distribution> + // distr(0, 1); + // auto res = distr(engine); - work[idx] = res; + // work[idx] = res; }); }); } diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 779db13d36a..a28029fc441 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -127,6 +127,7 @@ if (GINKGO_HAVE_ROCTX) endif() target_compile_options(ginkgo_hip PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) +target_compile_definitions(ginkgo_hip PRIVATE GINKGO_COMPILE_KERNEL=1) if(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") find_package(hip REQUIRED) diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index c886378ec80..23fd49570a9 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -54,11 +54,55 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
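An aside on the oneMKL dispatch above: the `onemkl_support` trait plus `if constexpr` keeps unsupported precisions (e.g. a 16-bit type) from ever instantiating the library call. A minimal sketch of the pattern, with illustrative names (`dispatch_gemm` is not from the patch):

#include <stdexcept>
#include <type_traits>

template <typename T>
struct onemkl_support : std::false_type {};

template <>
struct onemkl_support<float> : std::true_type {};

template <>
struct onemkl_support<double> : std::true_type {};

template <typename ValueType>
void dispatch_gemm(/* matrix arguments elided */)
{
    if constexpr (onemkl_support<ValueType>::value) {
        // safe to call oneapi::mkl::blas::row_major::gemm here
    } else {
        // for unsupported value types this branch is the only one kept,
        // so no half-precision gemm symbol is ever required from the library
        throw std::runtime_error("gemm: unsupported value type");
    }
}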
#include +namespace std { + +template <> +struct is_scalar<__half> : std::true_type {}; + +} // namespace std + + namespace gko { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +// template <> +__device__ __forceinline__ bool is_nan(const __half& val) +{ + return is_nan(float(val)); +} +template <> +GKO_INLINE GKO_ATTRIBUTES constexpr __half abs(const __half& val) +{ + return __habs(val); +} +#endif namespace kernels { namespace hip { + +#if defined(__HIPCC__) +// #endif +// __device__ __half sqrt(__half val) { return hsqrt(val); } +// if directly using above, it will lead all double, float goes to half version +__device__ __half sqrt(__half val) { return hsqrt(val); } +__device__ float sqrt(float val) { return sqrtf(val); } +__device__ double sqrt(double val) { return sqrt(val); } +__device__ thrust::complex sqrt(thrust::complex val) +{ + return thrust::sqrt(val); +} +__device__ thrust::complex sqrt(thrust::complex val) +{ + return thrust::sqrt(val); +} +// template +// __device__ __forceinline__ +// std::enable_if_t::value, __half> +// sqrt(const T& val) +// { +// return hsqrt(val); +// } +#endif namespace detail { @@ -158,6 +202,17 @@ struct hiplibs_type_impl> { using type = hipDoubleComplex; }; +template <> +struct hiplibs_type_impl { + using type = __half; +}; + +template <> +struct hiplibs_type_impl> { + using type = __half2; +}; + + template struct hiplibs_type_impl> { using type = typename hiplibs_type_impl>::type; @@ -230,9 +285,14 @@ struct hip_type_impl { using type = volatile typename hip_type_impl::type; }; +template <> +struct hip_type_impl { + using type = __half; +}; + template struct hip_type_impl> { - using type = thrust::complex; + using type = thrust::complex::type>; }; template <> @@ -245,6 +305,11 @@ struct hip_type_impl { using type = thrust::complex; }; +template <> +struct hip_type_impl<__half2> { + using type = thrust::complex<__half>; +}; + template struct hip_struct_member_type_impl { using type = T; @@ -255,6 +320,11 @@ struct hip_struct_member_type_impl> { using type = fake_complex; }; +template <> +struct hip_struct_member_type_impl { + using type = __half; +}; + template struct hip_type_impl> { using type = diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp index 647a6f9bc22..f7bc45c087b 100644 --- a/hip/components/cooperative_groups.hip.hpp +++ b/hip/components/cooperative_groups.hip.hpp @@ -335,7 +335,7 @@ class enable_extended_shuffle : public Group { SelectorType selector) const \ { \ return shuffle_impl( \ - [this](uint32 v, SelectorType s) { \ + [this](uint16 v, SelectorType s) { \ return static_cast(this)->_name(v, s); \ }, \ var, selector); \ @@ -355,12 +355,12 @@ class enable_extended_shuffle : public Group { shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var, SelectorType selector) { - static_assert(sizeof(ValueType) % sizeof(uint32) == 0, + static_assert(sizeof(ValueType) % sizeof(uint16) == 0, "Unable to shuffle sizes which are not 4-byte multiples"); - constexpr auto value_size = sizeof(ValueType) / sizeof(uint32); + constexpr auto value_size = sizeof(ValueType) / sizeof(uint16); ValueType result; - auto var_array = reinterpret_cast(&var); - auto result_array = reinterpret_cast(&result); + auto var_array = reinterpret_cast(&var); + auto result_array = reinterpret_cast(&result); #pragma unroll for (std::size_t i = 0; i < value_size; ++i) { result_array[i] = intrinsic_shuffle(var_array[i], selector); diff --git a/hip/matrix/fft_kernels.hip.cpp 
b/hip/matrix/fft_kernels.hip.cpp index 56c967d9e49..e793663c6b8 100644 --- a/hip/matrix/fft_kernels.hip.cpp +++ b/hip/matrix/fft_kernels.hip.cpp @@ -191,7 +191,7 @@ void fft(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT_KERNEL); template @@ -207,7 +207,8 @@ void fft2(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT2_KERNEL); template @@ -223,7 +224,8 @@ void fft3(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp index 8f79cafd427..d68dc5797db 100644 --- a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp @@ -53,6 +53,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { +namespace detail { +#if !defined(__HIP_DEVICE_COMPILE__) +template <> +struct basic_float_traits<__half> { + using type = __half; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 10; + static constexpr int exponent_bits = 5; + static constexpr bool rounds_to_nearest = true; +}; +#endif +} // namespace detail namespace kernels { namespace hip { /** diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp index 9e6f353abe4..8d106b6c962 100644 --- a/hip/solver/idr_kernels.hip.cpp +++ b/hip/solver/idr_kernels.hip.cpp @@ -102,10 +102,10 @@ void initialize_subspace_vectors(std::shared_ptr exec, auto gen = hiprand::rand_generator(std::random_device{}(), HIPRAND_RNG_PSEUDO_DEFAULT, exec->get_stream()); - hiprand::rand_vector( - gen, - subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), - 0.0, 1.0, subspace_vectors->get_values()); + // hiprand::rand_vector( + // gen, + // subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), + // 0.0, 1.0, subspace_vectors->get_values()); } } diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 324f830e790..88bdabe20a6 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -47,6 +47,34 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +// Using SYCL_LANGUAGE_VERSION will lead the mismatch sycl namespace from 6.0.0 +// when using dpcpp compiler without dpcpp module +#if GINKGO_DPCPP_MAJOR_VERSION +#include +#endif + + +namespace std { + + +inline gko::half abs(gko::half a) { return gko::half((a > 0) ? a : -a); } + +inline gko::half abs(std::complex a) +{ + return gko::half(sqrt(float(a.real() * a.real() + a.imag() * a.imag()))); +} + +inline gko::half sqrt(gko::half a) { return gko::half(sqrt(float(a))); } + +inline std::complex sqrt(std::complex a) +{ + return std::complex(sqrt(std::complex(a))); +} + + +} // namespace std + + namespace gko { @@ -62,18 +90,6 @@ using std::abs; using std::sqrt; -inline half abs(half a) { return half((a > 0) ? 
a : -a); } -inline half abs(std::complex a) -{ - return half(sqrt(float(a.real() * a.real() + a.imag() * a.imag()))); -} -inline half sqrt(half a) { return half(sqrt(float(a))); } - -inline std::complex sqrt(std::complex a) -{ - return std::complex(sqrt(std::complex(a))); -} - } // namespace reference } // namespace kernels @@ -88,19 +104,6 @@ using std::abs; using std::sqrt; -inline half abs(half a) { return half((a > 0) ? a : -a); } -inline half abs(std::complex a) -{ - return half(sqrt(float(a.real() * a.real() + a.imag() * a.imag()))); -} -inline half sqrt(half a) { return half(sqrt(float(a))); } - -inline std::complex sqrt(std::complex a) -{ - return std::complex(sqrt(std::complex(a))); -} - - } // namespace omp } // namespace kernels @@ -205,8 +208,12 @@ struct is_complex_impl> template struct is_complex_or_scalar_impl : std::is_scalar {}; +template <> +struct is_complex_or_scalar_impl : std::true_type {}; + template -struct is_complex_or_scalar_impl> : std::is_scalar {}; +struct is_complex_or_scalar_impl> + : is_complex_or_scalar_impl {}; /** diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index bf985cabeb7..0b71bdf6f07 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -108,6 +108,9 @@ struct type_impl {}; GKO_REGISTER_MPI_TYPE(char, MPI_CHAR); GKO_REGISTER_MPI_TYPE(unsigned char, MPI_UNSIGNED_CHAR); GKO_REGISTER_MPI_TYPE(unsigned, MPI_UNSIGNED); +// OpenMPI 5.0 have support from MPIX_C_FLOAT16 and MPICHv3.4a1 MPIX_C_FLOAT16 +// TODO: it only works on the transferring +GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT); GKO_REGISTER_MPI_TYPE(int, MPI_INT); GKO_REGISTER_MPI_TYPE(unsigned short, MPI_UNSIGNED_SHORT); GKO_REGISTER_MPI_TYPE(unsigned long, MPI_UNSIGNED_LONG); @@ -117,6 +120,8 @@ GKO_REGISTER_MPI_TYPE(unsigned long long, MPI_UNSIGNED_LONG_LONG); GKO_REGISTER_MPI_TYPE(float, MPI_FLOAT); GKO_REGISTER_MPI_TYPE(double, MPI_DOUBLE); GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE); +// TODO: it only works on the transferring +GKO_REGISTER_MPI_TYPE(std::complex, MPI_FLOAT); GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_FLOAT_COMPLEX); GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_DOUBLE_COMPLEX); diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index 574ca19a7d3..e6968756b1b 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -389,7 +389,13 @@ make_temporary_conversion(LinOp* matrix) experimental::distributed::Vector>>( matrix); if (!result) { - GKO_NOT_SUPPORTED(matrix); + result = detail::temporary_conversion< + experimental::distributed::Vector>:: + template create>>>(matrix); + if (!result) { + GKO_NOT_SUPPORTED(matrix); + } } return result; } @@ -408,7 +414,13 @@ make_temporary_conversion(const LinOp* matrix) experimental::distributed::Vector>>( matrix); if (!result) { - GKO_NOT_SUPPORTED(matrix); + result = detail::temporary_conversion< + const experimental::distributed::Vector>:: + template create>>>(matrix); + if (!result) { + GKO_NOT_SUPPORTED(matrix); + } } return result; } diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index da91f8f0e60..b3fca57f341 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -268,11 +268,15 @@ class Matrix Matrix>, public ConvertibleTo< Matrix, LocalIndexType, GlobalIndexType>>, + public ConvertibleTo>, + LocalIndexType, GlobalIndexType>>, public 
DistributedBase { friend class EnableCreateMethod; friend class EnableDistributedPolymorphicObject; - friend class Matrix, LocalIndexType, + friend class Matrix, LocalIndexType, GlobalIndexType>; + friend class Matrix>, + LocalIndexType, GlobalIndexType>; public: using value_type = ValueType; @@ -296,6 +300,13 @@ class Matrix void move_to(Matrix, local_index_type, global_index_type>* result) override; + void convert_to( + Matrix>, local_index_type, + global_index_type>* result) const override; + + void move_to(Matrix>, + local_index_type, global_index_type>* result) override; + /** * Reads a square matrix from the device_matrix_data structure and a global * partition. diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index 61ceab8e380..2547e2da2c9 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -88,13 +88,15 @@ class Vector : public EnableDistributedLinOp>, public EnableCreateMethod>, public ConvertibleTo>>, + public ConvertibleTo>>>, public EnableAbsoluteComputation>>, public DistributedBase { friend class EnableCreateMethod; friend class EnableDistributedPolymorphicObject; friend class Vector>; friend class Vector>; - friend class Vector>; + friend class Vector>; + friend class Vector>>; public: using EnableDistributedLinOp::convert_to; @@ -193,6 +195,12 @@ class Vector void move_to(Vector>* result) override; + void convert_to(Vector>>* result) + const override; + + void move_to( + Vector>>* result) override; + std::unique_ptr compute_absolute() const override; void compute_absolute_inplace() override; @@ -641,12 +649,21 @@ struct conversion_target_helper> { using target_type = experimental::distributed::Vector; using source_type = experimental::distributed::Vector>; + using snd_source_type = experimental::distributed::Vector< + previous_precision>>; static std::unique_ptr create_empty(const source_type* source) { return target_type::create(source->get_executor(), source->get_communicator()); } + + static std::unique_ptr create_empty( + const snd_source_type* source) + { + return target_type::create(source->get_executor(), + source->get_communicator()); + } }; diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index 9192b2eeebe..39c27043324 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -617,10 +617,7 @@ class Matrix : public CommonTestFixture { template gko::matrix_data gen_dense_data(gko::dim<2> size) { - return { - size, - std::normal_distribution>(0.0, 1.0), - rand_engine}; + return {size, std::normal_distribution<>(0.0, 1.0), rand_engine}; } template @@ -640,10 +637,7 @@ class Matrix : public CommonTestFixture { return {gko::initialize( {gko::test::detail::get_rand_value< typename VecType::value_type>( - std::normal_distribution< - gko::remove_complex>( - 0.0, 1.0), - rand_engine)}, + std::normal_distribution<>(0.0, 1.0), rand_engine)}, ref), exec}; } diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 7b72f4aeaab..3ad971bf18a 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -239,12 +239,10 @@ class Matrix : public CommonMpiTestFixture { alpha = gko::test::generate_random_matrix( 1, 1, std::uniform_int_distribution(1, 1), - std::normal_distribution>(), - this->engine, this->exec); + std::normal_distribution<>(), this->engine, this->exec); beta = gko::test::generate_random_matrix( 1, 1, std::uniform_int_distribution(1, 1), - std::normal_distribution>(), - this->engine, this->exec); + std::normal_distribution<>(), 
this->engine, this->exec); } void SetUp() override { ASSERT_EQ(comm.size(), 3); } @@ -284,14 +282,12 @@ class Matrix : public CommonMpiTestFixture { num_rows, num_cols, std::uniform_int_distribution(static_cast(num_cols), static_cast(num_cols)), - std::normal_distribution>(), - engine); + std::normal_distribution<>(), engine); auto mat_md = gko::test::generate_random_matrix_data( num_rows, num_rows, std::uniform_int_distribution(0, static_cast(num_rows)), - std::normal_distribution>(), - engine); + std::normal_distribution<>(), engine); auto row_mapping = gko::test::generate_random_array< gko::experimental::distributed::comm_index_type>( diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp index 59462a9be59..1494ab117f2 100644 --- a/test/mpi/solver/solver.cpp +++ b/test/mpi/solver/solver.cpp @@ -252,10 +252,7 @@ class Solver : public CommonMpiTestFixture { template gko::matrix_data gen_dense_data(gko::dim<2> size) { - return { - size, - std::normal_distribution>(0.0, 1.0), - rand_engine}; + return {size, std::normal_distribution<>(0.0, 1.0), rand_engine}; } template @@ -282,10 +279,7 @@ class Solver : public CommonMpiTestFixture { { return gko::share(gko::initialize( {gko::test::detail::get_rand_value( - std::normal_distribution< - gko::remove_complex>(0.0, - 1.0), - rand_engine)}, + std::normal_distribution<>(0.0, 1.0), rand_engine)}, exec)); } diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp index b6f228c13f5..30bddf11535 100644 --- a/test/solver/solver.cpp +++ b/test/solver/solver.cpp @@ -589,10 +589,7 @@ class Solver : public CommonTestFixture { template gko::matrix_data gen_dense_data(gko::dim<2> size) { - return { - size, - std::normal_distribution>(0.0, 1.0), - rand_engine}; + return {size, std::normal_distribution<>(0.0, 1.0), rand_engine}; } template @@ -613,10 +610,7 @@ class Solver : public CommonTestFixture { return {gko::initialize( {gko::test::detail::get_rand_value< typename VecType::value_type>( - std::normal_distribution< - gko::remove_complex>( - 0.0, 1.0), - rand_engine)}, + std::normal_distribution<>(0.0, 1.0), rand_engine)}, ref), exec}; } From 6b5a4ec51bcfabcb73df7822dd7a44552a94e48c Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 11 Jan 2023 19:17:28 -0600 Subject: [PATCH 05/48] hip finally --- common/cuda_hip/base/math.hpp.inc | 3 +- cuda/base/math.hpp | 1 + cuda/base/types.hpp | 15 +++----- hip/base/types.hip.hpp | 63 +++++++++++++++++++------------ include/ginkgo/core/base/math.hpp | 6 +++ 5 files changed, 52 insertions(+), 36 deletions(-) diff --git a/common/cuda_hip/base/math.hpp.inc b/common/cuda_hip/base/math.hpp.inc index 583dd01ef93..d533f181222 100644 --- a/common/cuda_hip/base/math.hpp.inc +++ b/common/cuda_hip/base/math.hpp.inc @@ -55,7 +55,8 @@ struct is_complex_impl> template -struct is_complex_or_scalar_impl> : std::is_scalar {}; +struct is_complex_or_scalar_impl> + : is_complex_or_scalar_impl {}; template diff --git a/cuda/base/math.hpp b/cuda/base/math.hpp index 026c570957c..07243caa25f 100644 --- a/cuda/base/math.hpp +++ b/cuda/base/math.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include + namespace gko { diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index fdfa6953f5b..1e3b6f99c5c 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -50,31 +50,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
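An aside on the trait change above: recursing on the element type (rather than asking std::is_scalar directly) is what lets a complex of a class-type scalar such as half qualify. A self-contained sketch with stand-in types (`my_half` and `my_complex` are illustrative, not the patch's types):

#include <type_traits>

struct my_half {};  // class type, so std::is_scalar<my_half> is false

template <typename T>
struct my_complex {
    T real;
    T imag;
};

template <typename T>
struct is_complex_or_scalar_impl : std::is_scalar<T> {};

// class-type scalars have to be whitelisted explicitly
template <>
struct is_complex_or_scalar_impl<my_half> : std::true_type {};

// recurse on the element type so my_complex<my_half> is accepted as well
template <typename T>
struct is_complex_or_scalar_impl<my_complex<T>> : is_complex_or_scalar_impl<T> {};

static_assert(is_complex_or_scalar_impl<my_half>::value, "half is scalar-like");
static_assert(is_complex_or_scalar_impl<my_complex<my_half>>::value,
              "complex of half is accepted");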
#include -namespace std { - -template <> -struct is_scalar<__half> : std::true_type {}; - -} // namespace std - namespace gko { -#if defined(__CUDA_ARCH__) + template <> __device__ __forceinline__ bool is_nan(const __half& val) { return is_nan(float(val)); } -#endif + namespace kernels { namespace cuda { + +// __habs only defined when CUDA_ARCH #if defined(__CUDA_ARCH__) -// template <> __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } __device__ __forceinline__ __half sqrt(const __half& val) { return hsqrt(val); } #endif + namespace detail { /** diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 23fd49570a9..2b35e8bbec8 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -54,14 +54,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -namespace std { - -template <> -struct is_scalar<__half> : std::true_type {}; - -} // namespace std - - namespace gko { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) // template <> @@ -75,34 +67,55 @@ GKO_INLINE GKO_ATTRIBUTES constexpr __half abs(const __half& val) { return __habs(val); } -#endif -namespace kernels { -namespace hip { +#endif #if defined(__HIPCC__) -// #endif -// __device__ __half sqrt(__half val) { return hsqrt(val); } -// if directly using above, it will lead all double, float goes to half version -__device__ __half sqrt(__half val) { return hsqrt(val); } -__device__ float sqrt(float val) { return sqrtf(val); } -__device__ double sqrt(double val) { return sqrt(val); } -__device__ thrust::complex sqrt(thrust::complex val) +GKO_INLINE +GKO_ATTRIBUTES __half sqrt(__half val) { return hsqrt(val); } +GKO_INLINE +GKO_ATTRIBUTES float sqrt(float val) { return sqrtf(val); } +GKO_INLINE +GKO_ATTRIBUTES double sqrt(double val) { return sqrt(val); } +GKO_INLINE +GKO_ATTRIBUTES thrust::complex sqrt(thrust::complex val) { return thrust::sqrt(val); } -__device__ thrust::complex sqrt(thrust::complex val) +GKO_INLINE +GKO_ATTRIBUTES thrust::complex sqrt(thrust::complex val) { return thrust::sqrt(val); } -// template -// __device__ __forceinline__ -// std::enable_if_t::value, __half> -// sqrt(const T& val) +#endif + +// #if defined(__HIPCC__) +// // #endif +// // __device__ __half sqrt(__half val) { return hsqrt(val); } +// // if directly using above, it will lead all double, float goes to half +// version +// __device__ __half sqrt(__half val) { return hsqrt(val); } +// __device__ float sqrt(float val) { return sqrtf(val); } +// __device__ double sqrt(double val) { return sqrt(val); } +// __device__ thrust::complex sqrt(thrust::complex val) // { -// return hsqrt(val); +// return thrust::sqrt(val); // } -#endif +// __device__ thrust::complex sqrt(thrust::complex val) +// { +// return thrust::sqrt(val); +// } +// // template +// // __device__ __forceinline__ +// // std::enable_if_t::value, __half> +// // sqrt(const T& val) +// // { +// // return hsqrt(val); +// // } +// #endif + +namespace kernels { +namespace hip { namespace detail { diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 88bdabe20a6..c83b1ca937f 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -54,6 +54,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +class __half; + + namespace std { @@ -211,6 +214,9 @@ struct is_complex_or_scalar_impl : std::is_scalar {}; template <> struct is_complex_or_scalar_impl : std::true_type {}; +template <> +struct is_complex_or_scalar_impl<__half> : std::true_type {}; + template struct is_complex_or_scalar_impl> : is_complex_or_scalar_impl {}; From add37ab4365c38a4c1eb6809c1d84c898cbe81b6 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 11 Jan 2023 20:56:21 -0600 Subject: [PATCH 06/48] fix the narrow issue and atomic support --- common/cuda_hip/components/atomic.hpp.inc | 6 ++-- core/log/papi.cpp | 5 ++- cuda/base/types.hpp | 42 ++++++++++++++++++++-- reference/test/matrix/coo_kernels.cpp | 8 +++-- reference/test/matrix/csr_kernels.cpp | 8 +++-- reference/test/matrix/dense_kernels.cpp | 16 +++++---- reference/test/matrix/diagonal_kernels.cpp | 8 +++-- reference/test/matrix/ell_kernels.cpp | 8 +++-- reference/test/matrix/fbcsr_kernels.cpp | 8 +++-- reference/test/matrix/hybrid_kernels.cpp | 8 +++-- reference/test/matrix/sellp_kernels.cpp | 8 +++-- test/mpi/matrix.cpp | 4 +-- 12 files changed, 99 insertions(+), 30 deletions(-) diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc index 5394649335e..5e6e90976ac 100644 --- a/common/cuda_hip/components/atomic.hpp.inc +++ b/common/cuda_hip/components/atomic.hpp.inc @@ -119,6 +119,7 @@ __forceinline__ __device__ ResultType reinterpret(ValueType val) __forceinline__ __device__ static ValueType atomic_add( \ ValueType* __restrict__ addr, ValueType val) \ { \ + assert(false); \ using c_type = CONVERTER_TYPE; \ return atomic_wrapper( \ addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \ @@ -130,6 +131,7 @@ __forceinline__ __device__ ResultType reinterpret(ValueType val) __forceinline__ __device__ static ValueType atomic_max( \ ValueType* __restrict__ addr, ValueType val) \ { \ + assert(false); \ using c_type = CONVERTER_TYPE; \ return atomic_wrapper( \ addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \ @@ -161,9 +163,9 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); #if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) && \ - !(defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC) + (__CUDA_ARCH__ >= 700) && !(defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC) // CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS -// required the CC>70 +// required the CC>=70 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int); #else GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(unsigned short int) diff --git a/core/log/papi.cpp b/core/log/papi.cpp index ff1cc1de3d0..3d98e62d0d0 100644 --- a/core/log/papi.cpp +++ b/core/log/papi.cpp @@ -243,15 +243,14 @@ void Papi::on_criterion_check_completed( double residual_norm_d = 0.0; if (residual_norm != nullptr) { auto dense_r_norm = as(residual_norm); - residual_norm_d = - static_cast(std::real(dense_r_norm->at(0, 0))); + residual_norm_d = static_cast(real(dense_r_norm->at(0, 0))); } else if (residual != nullptr) { detail::vector_dispatch(residual, [&](const auto* dense_r) { auto tmp_res_norm = Vector::create( residual->get_executor(), dim<2>{1, residual->get_size()[1]}); dense_r->compute_norm2(tmp_res_norm); residual_norm_d = - static_cast(std::real(tmp_res_norm->at(0, 0))); + static_cast(real(tmp_res_norm->at(0, 0))); }); } diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 1e3b6f99c5c..148fdfde2dd 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -53,23 +53,59 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
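An aside on the atomic change above: the 16-bit atomicCAS overload only exists on devices of compute capability 7.0 and newer (CUDA 10.1+), which is why the 16-bit binding is guarded. A hedged sketch of the CAS loop such a binding expands to, assuming cc >= 7.0 and the cuda_fp16.h reinterpret helpers; `atomic_add_half` is an illustrative name, not the library's API:

#include <cuda_fp16.h>

__device__ __half atomic_add_half(__half* addr, __half val)
{
    auto* raw = reinterpret_cast<unsigned short int*>(addr);
    unsigned short int old = *raw;
    unsigned short int assumed;
    do {
        assumed = old;
        // reinterpret the 16-bit payload as __half, add, and try to swap it in
        const __half updated = __ushort_as_half(assumed) + val;
        old = atomicCAS(raw, assumed, __half_as_ushort(updated));
    } while (assumed != old);
    // like atomicAdd, return the value stored before the update
    return __ushort_as_half(old);
}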
namespace gko { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + + template <> __device__ __forceinline__ bool is_nan(const __half& val) { - return is_nan(float(val)); + return __hisnan(val); } +#else + + +template <> +__device__ __forceinline__ bool is_nan(const __half& val) +{ + return isnan(static_cast(val)); +} + + +#endif + + namespace kernels { namespace cuda { -// __habs only defined when CUDA_ARCH -#if defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + + __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } + __device__ __forceinline__ __half sqrt(const __half& val) { return hsqrt(val); } + + +#else + + +__device__ __forceinline__ __half abs(const __half& val) +{ + return abs(static_cast(val)); +} + + +__device__ __forceinline__ __half sqrt(const __half& val) +{ + return sqrt(static_cast(val)); +} + + #endif + namespace detail { /** diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp index 375486cd72f..8a9061eb09b 100644 --- a/reference/test/matrix/coo_kernels.cpp +++ b/reference/test/matrix/coo_kernels.cpp @@ -119,7 +119,9 @@ TYPED_TEST(Coo, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx->convert_to(tmp); tmp->convert_to(res); @@ -140,7 +142,9 @@ TYPED_TEST(Coo, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx->move_to(tmp); tmp->move_to(res); diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index d56201ade02..596f103cb21 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -801,7 +801,9 @@ TYPED_TEST(Csr, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; // use mtx2 as mtx's strategy would involve creating a CudaExecutor this->mtx2->convert_to(tmp); @@ -826,7 +828,9 @@ TYPED_TEST(Csr, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; // use mtx2 as mtx's strategy would involve creating a CudaExecutor this->mtx2->move_to(tmp); diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 9edab89e382..a9105279626 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -749,9 +749,11 @@ TYPED_TEST(Dense, ConvertsToPrecision) auto tmp = OtherDense::create(this->exec); auto res = Dense::create(this->exec); // If OtherT is more precise: 0, otherwise r - auto residual = r::value < r::value - ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + auto residual = + r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{ + static_cast>(r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -769,9 +771,11 @@ TYPED_TEST(Dense, MovesToPrecision) auto tmp = OtherDense::create(this->exec); auto res = Dense::create(this->exec); // If OtherT is more precise: 0, otherwise r - auto residual = r::value < r::value - ? 
gko::remove_complex{0} - : gko::remove_complex{r::value}; + auto residual = + r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{ + static_cast>(r::value)}; this->mtx1->move_to(tmp); tmp->move_to(res); diff --git a/reference/test/matrix/diagonal_kernels.cpp b/reference/test/matrix/diagonal_kernels.cpp index f8803916363..4a043555a08 100644 --- a/reference/test/matrix/diagonal_kernels.cpp +++ b/reference/test/matrix/diagonal_kernels.cpp @@ -125,7 +125,9 @@ TYPED_TEST(Diagonal, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->diag1->convert_to(tmp); tmp->convert_to(res); @@ -145,7 +147,9 @@ TYPED_TEST(Diagonal, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->diag1->move_to(tmp); tmp->move_to(res); diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp index 135607230a5..b76487b9fb2 100644 --- a/reference/test/matrix/ell_kernels.cpp +++ b/reference/test/matrix/ell_kernels.cpp @@ -484,7 +484,9 @@ TYPED_TEST(Ell, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -505,7 +507,9 @@ TYPED_TEST(Ell, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->move_to(tmp); tmp->move_to(res); diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp index e5b948df11a..95a0d2db6ff 100644 --- a/reference/test/matrix/fbcsr_kernels.cpp +++ b/reference/test/matrix/fbcsr_kernels.cpp @@ -311,7 +311,9 @@ TYPED_TEST(Fbcsr, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx->convert_to(tmp); tmp->convert_to(res); @@ -332,7 +334,9 @@ TYPED_TEST(Fbcsr, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx->move_to(tmp); tmp->move_to(res); diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp index c234fe0179b..85086b334c5 100644 --- a/reference/test/matrix/hybrid_kernels.cpp +++ b/reference/test/matrix/hybrid_kernels.cpp @@ -273,7 +273,9 @@ TYPED_TEST(Hybrid, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -294,7 +296,9 @@ TYPED_TEST(Hybrid, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? 
gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->move_to(tmp); tmp->move_to(res); diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp index a5697fd1ce9..1fb65e940c4 100644 --- a/reference/test/matrix/sellp_kernels.cpp +++ b/reference/test/matrix/sellp_kernels.cpp @@ -228,7 +228,9 @@ TYPED_TEST(Sellp, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -249,7 +251,9 @@ TYPED_TEST(Sellp, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->move_to(tmp); tmp->move_to(res); diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 3ad971bf18a..5d12ae9afa7 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -446,7 +446,7 @@ TYPED_TEST(Matrix, CanConvertToNextPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{static_cast>(r::value)}; this->dist_mat->convert_to(tmp); tmp->convert_to(res); @@ -473,7 +473,7 @@ TYPED_TEST(Matrix, CanMoveToNextPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{static_cast>(r::value)}; this->dist_mat->move_to(tmp); tmp->convert_to(res); From 2f53fcecfa50b2268e9307c0984efc535bb72edb Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 11 Jan 2023 23:29:26 -0600 Subject: [PATCH 07/48] fixed more error --- cuda/components/cooperative_groups.cuh | 12 ++-- cuda/solver/common_trs_kernels.cuh | 10 ++-- hip/base/types.hip.hpp | 77 +++++++++++++------------- 3 files changed, 48 insertions(+), 51 deletions(-) diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh index db59a47658d..0cd2e9688a1 100644 --- a/cuda/components/cooperative_groups.cuh +++ b/cuda/components/cooperative_groups.cuh @@ -332,7 +332,7 @@ public: SelectorType selector) const \ { \ return shuffle_impl( \ - [this](uint32 v, SelectorType s) { \ + [this](uint16 v, SelectorType s) { \ return static_cast(this)->_name(v, s); \ }, \ var, selector); \ @@ -352,12 +352,12 @@ private: shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var, SelectorType selector) { - static_assert(sizeof(ValueType) % sizeof(uint32) == 0, - "Unable to shuffle sizes which are not 4-byte multiples"); - constexpr auto value_size = sizeof(ValueType) / sizeof(uint32); + static_assert(sizeof(ValueType) % sizeof(uint16) == 0, + "Unable to shuffle sizes which are not 2-byte multiples"); + constexpr auto value_size = sizeof(ValueType) / sizeof(uint16); ValueType result; - auto var_array = reinterpret_cast(&var); - auto result_array = reinterpret_cast(&result); + auto var_array = reinterpret_cast(&var); + auto result_array = reinterpret_cast(&result); #pragma unroll for (std::size_t i = 0; i < value_size; ++i) { result_array[i] = intrinsic_shuffle(var_array[i], selector); diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index fa6e1de79fa..97587a7d2cd 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -240,14 +240,14 @@ struct CudaSolveStruct : gko::solver::SolveStruct { policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; size_type work_size{}; - + // In nullptr is considered nullptr_t not casted to const ValueType* cusparse::buffer_size_ext( handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, - &work_size); + matrix->get_const_col_idxs(), (const ValueType*)(nullptr), num_rhs, + solve_info, policy, &work_size); // allocate workspace work.resize_and_reset(work_size); @@ -257,8 +257,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct { CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, - work.get_data()); + matrix->get_const_col_idxs(), (const ValueType*)(nullptr), num_rhs, + solve_info, policy, work.get_data()); } void solve(const matrix::Csr* matrix, diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 2b35e8bbec8..23f8fc55fc2 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -55,64 +55,61 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace gko { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) -// template <> +#if defined(__CUDA_ARCH__) +#if __CUDA_ARCH__ >= 700 __device__ __forceinline__ bool is_nan(const __half& val) { - return is_nan(float(val)); + return __hisnan(val); } -template <> -GKO_INLINE GKO_ATTRIBUTES constexpr __half abs(const __half& val) +__device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } +#else +__device__ __forceinline__ bool is_nan(const __half& val) { - return __habs(val); + return is_nan(static_cast(val)); } +__device__ __forceinline__ __half abs(const __half& val) +{ + return abs(static_cast(val)); +} +#endif + +#elif defined(__HIP_DEVICE_COMPILE__) +__device__ __forceinline__ bool is_nan(const __half& val) +{ + return __hisnan(val); +} + +// rocm40 __habs is not constexpr +__device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } + #endif #if defined(__HIPCC__) -GKO_INLINE -GKO_ATTRIBUTES __half sqrt(__half val) { return hsqrt(val); } -GKO_INLINE -GKO_ATTRIBUTES float sqrt(float val) { return sqrtf(val); } -GKO_INLINE -GKO_ATTRIBUTES double sqrt(double val) { return sqrt(val); } -GKO_INLINE -GKO_ATTRIBUTES thrust::complex sqrt(thrust::complex val) +__device__ __forceinline__ float sqrt(float val) { return sqrtf(val); } +__device__ __forceinline__ double sqrt(double val) { return sqrt(val); } +__device__ __forceinline__ thrust::complex sqrt( + thrust::complex val) { return thrust::sqrt(val); } -GKO_INLINE -GKO_ATTRIBUTES thrust::complex sqrt(thrust::complex val) +__device__ __forceinline__ thrust::complex sqrt( + thrust::complex val) { return thrust::sqrt(val); } + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +__device__ __forceinline__ __half sqrt(__half val) +{ + return sqrt(static_cast(val)); +} +#else +__device__ __forceinline__ __half sqrt(__half val) { return hsqrt(val); } +#endif #endif -// #if defined(__HIPCC__) -// // #endif -// // __device__ __half sqrt(__half val) { return hsqrt(val); } -// // if directly using above, it will lead all double, float goes to half -// version -// __device__ __half sqrt(__half val) { return hsqrt(val); } -// __device__ float sqrt(float val) { return sqrtf(val); } -// __device__ double sqrt(double val) { return sqrt(val); } -// __device__ thrust::complex sqrt(thrust::complex val) -// { -// return thrust::sqrt(val); -// } -// __device__ thrust::complex sqrt(thrust::complex val) -// { -// return thrust::sqrt(val); -// } -// // template -// // __device__ __forceinline__ -// // std::enable_if_t::value, __half> -// // sqrt(const T& val) -// // { -// // return hsqrt(val); -// // } -// #endif namespace kernels { namespace hip { From cdc4d6b5b697653ca6cde3736d296a578ed6e77e Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 12 Jan 2023 08:34:02 -0600 Subject: [PATCH 08/48] fix the op order and gdb Co-authored-by: Marcel Koch --- core/base/extended_float.hpp | 4 ++-- dev_tools/scripts/gdb-ginkgo.py | 41 ++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index 82114026d40..1bcfa76e818 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -396,8 +396,8 @@ class half { using type = \ typename std::conditional::value, T, \ half>::type; \ - auto result = static_cast(hf); \ - result _opeq static_cast(val); \ + auto result = static_cast(val); \ + result _opeq static_cast(hf); \ return result; \ } diff --git a/dev_tools/scripts/gdb-ginkgo.py b/dev_tools/scripts/gdb-ginkgo.py index c028e72994e..2c52af6452f 100644 --- a/dev_tools/scripts/gdb-ginkgo.py +++ b/dev_tools/scripts/gdb-ginkgo.py @@ -51,6 +51,7 @@ def next(self): _versioned_namespace = '__8::' + # new version adapted from https://gcc.gnu.org/pipermail/gcc-cvs/2021-November/356230.html # necessary due to empty class optimization def is_specialization_of(x, template_name): @@ -64,6 +65,7 @@ def is_specialization_of(x, template_name): expr = '^std::{}<.*>$'.format(template_name) return re.match(expr, x) is not None + def get_template_arg_list(type_obj): "Return a type's template arguments as a list" n = 0 @@ -75,6 +77,7 @@ def get_template_arg_list(type_obj): return template_args n += 1 + def _tuple_impl_get(val): "Return the tuple element stored in a _Tuple_impl base class." bases = val.type.fields() @@ -95,6 +98,7 @@ def _tuple_impl_get(val): else: raise ValueError("Unsupported implementation for std::tuple: %s" % str(val.type)) + def tuple_get(n, val): "Return the result of std::get(val) on a std::tuple" tuple_size = len(get_template_arg_list(val.type)) @@ -108,6 +112,7 @@ def tuple_get(n, val): n -= 1 return _tuple_impl_get(node) + def get_unique_ptr_data_ptr(val): "Return the result of val.get() on a std::unique_ptr" # std::unique_ptr contains a std::tuple, @@ -219,13 +224,37 @@ def display_hint(self): return 'array' -def lookup_type(val): - if not str(val.type.unqualified()).startswith('gko::'): +class GkoHalfPrinter: + "Print a gko::half" + + def __init__(self, val): + # GDB doesn't seem to consider the user-defined conversion in its Value.cast, + # so we need to call the conversion operator explicitly + address = hex(val.address) + self.float_val = gdb.parse_and_eval(f"reinterpret_cast({address})->operator float()") + + def to_string(self): + self.float_val.fetch_lazy() + return self.float_val + + +def create_printer(val, type_suffix, type_printer): + val_type = gdb.types.get_basic_type(val.type) + if not str(val_type).startswith('gko::'): return None - suffix = str(val.type.unqualified())[5:] - if suffix.startswith('array'): - return GkoArrayPrinter(val) + suffix = str(val_type)[5:] + if suffix.startswith(type_suffix): + return type_printer(val) return None -gdb.pretty_printers.append(lookup_type) +def gko_array(val): + return create_printer(val, 'array', GkoArrayPrinter) + + +def gko_half(val): + return create_printer(val, 'half', GkoHalfPrinter) + + +gdb.pretty_printers.append(gko_array) +gdb.pretty_printers.append(gko_half) From c1c1551b72ca903e2c32e092cb276d4c53aa8e6c Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 12 Jan 2023 09:30:49 -0600 Subject: [PATCH 09/48] add the rand template not_implemented --- cuda/base/curand_bindings.hpp | 13 +++++++++++++ cuda/solver/idr_kernels.cu | 8 ++++---- dpcpp/solver/idr_kernels.dp.cpp | 10 ++++++---- hip/base/hiprand_bindings.hip.hpp | 13 +++++++++++++ 4 files changed, 36 insertions(+), 8 deletions(-) diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp index 429481ec9b6..d53af925df0 100644 --- a/cuda/base/curand_bindings.hpp +++ b/cuda/base/curand_bindings.hpp @@ -53,6 +53,17 @@ namespace cuda { * @ingroup curand */ namespace curand { +namespace detail { + + +template +inline int64 not_implemented(Args...) +{ + return static_cast(CURAND_STATUS_TYPE_ERROR); +} + + +} // namespace detail template @@ -101,6 +112,8 @@ GKO_BIND_CURAND_RANDOM_VECTOR(float, curandGenerateNormal); GKO_BIND_CURAND_RANDOM_VECTOR(double, curandGenerateNormalDouble); GKO_BIND_CURAND_RANDOM_VECTOR(std::complex, curandGenerateNormal); GKO_BIND_CURAND_RANDOM_VECTOR(std::complex, curandGenerateNormalDouble); +template +GKO_BIND_CURAND_RANDOM_VECTOR(ValueType, detail::not_implemented); #undef GKO_BIND_CURAND_RANDOM_VECTOR diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu index 4d41a79a7ba..10e8a7b2fc3 100644 --- a/cuda/solver/idr_kernels.cu +++ b/cuda/solver/idr_kernels.cu @@ -100,10 +100,10 @@ void initialize_subspace_vectors(std::shared_ptr exec, auto gen = curand::rand_generator(std::random_device{}(), CURAND_RNG_PSEUDO_DEFAULT, exec->get_stream()); - // curand::rand_vector( - // gen, - // subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), - // 0.0, 1.0, subspace_vectors->get_values()); + curand::rand_vector( + gen, + subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), + 0.0, 1.0, subspace_vectors->get_values()); } } diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index 452ffef9c5b..a9f5aa5b3c1 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -636,11 +636,13 @@ void initialize_subspace_vectors(std::shared_ptr exec, cgh.parallel_for(sycl::range<1>(n), [=](sycl::item<1> idx) { std::uint64_t offset = idx.get_linear_id(); oneapi::dpl::minstd_rand engine(seed, offset); - // oneapi::dpl::normal_distribution> - // distr(0, 1); - // auto res = distr(engine); + oneapi::dpl::normal_distribution< + typename ::gko::detail::arth_type< + remove_complex>::type> + distr(0, 1); + auto res = distr(engine); - // work[idx] = res; + work[idx] = res; }); }); } diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp index 14e144f6d84..4fb0703443b 100644 --- a/hip/base/hiprand_bindings.hip.hpp +++ b/hip/base/hiprand_bindings.hip.hpp @@ -58,6 +58,17 @@ namespace hip { * @ingroup hiprand */ namespace hiprand { +namespace detail { + + +template +inline int64 not_implemented(Args...) +{ + return static_cast(HIPRAND_STATUS_TYPE_ERROR); +} + + +} // namespace detail template @@ -107,6 +118,8 @@ GKO_BIND_HIPRAND_RANDOM_VECTOR(double, hiprandGenerateNormalDouble); GKO_BIND_HIPRAND_RANDOM_VECTOR(std::complex, hiprandGenerateNormal); GKO_BIND_HIPRAND_RANDOM_VECTOR(std::complex, hiprandGenerateNormalDouble); +template +GKO_BIND_HIPRAND_RANDOM_VECTOR(ValueType, detail::not_implemented); #undef GKO_BIND_HIPRAND_RANDOM_VECTOR From 3f1165758de7fd28cf4895d83392624eaed827fe Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 12 Jan 2023 16:26:23 -0600 Subject: [PATCH 10/48] this version can compile/run complex on cuda114 stack size can not be determined statically issue some tests are failed --- common/cuda_hip/base/math.hpp.inc | 12 +++--- common/cuda_hip/components/volatile.hpp.inc | 8 ++-- core/base/extended_float.hpp | 25 ++++++++--- core/test/utils/assertions.hpp | 10 +++-- cuda/base/types.hpp | 47 ++++++++++++++++++++- include/ginkgo/core/base/math.hpp | 16 ++++++- include/ginkgo/core/base/types.hpp | 4 +- 7 files changed, 99 insertions(+), 23 deletions(-) diff --git a/common/cuda_hip/base/math.hpp.inc b/common/cuda_hip/base/math.hpp.inc index d533f181222..54a165b8494 100644 --- a/common/cuda_hip/base/math.hpp.inc +++ b/common/cuda_hip/base/math.hpp.inc @@ -49,14 +49,14 @@ struct remove_complex_impl> { }; -template -struct is_complex_impl> - : public std::integral_constant {}; +// template +// struct is_complex_impl> +// : public std::integral_constant {}; -template -struct is_complex_or_scalar_impl> - : is_complex_or_scalar_impl {}; +// template +// struct is_complex_or_scalar_impl> +// : is_complex_or_scalar_impl {}; template diff --git a/common/cuda_hip/components/volatile.hpp.inc b/common/cuda_hip/components/volatile.hpp.inc index 402f73f088e..d9c56c71238 100644 --- a/common/cuda_hip/components/volatile.hpp.inc +++ b/common/cuda_hip/components/volatile.hpp.inc @@ -40,9 +40,11 @@ __device__ __forceinline__ } template -__device__ __forceinline__ std::enable_if_t< - std::is_floating_point::value, thrust::complex> -load(const thrust::complex* values, IndexType index) +__device__ __forceinline__ + std::enable_if_t::value || + std::is_same::value, + thrust::complex> + load(const thrust::complex* values, IndexType index) { auto real = reinterpret_cast(values); auto imag = real + 1; diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index 1bcfa76e818..263d5f1e833 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -579,16 +579,18 @@ class complex { : complex(static_cast(real), static_cast(imag)) {} - template + template ::value>> complex(const T& real) : complex(static_cast(real)) {} - template - explicit complex(const complex& other) + template ::value>> + explicit complex(const complex& other) : complex(static_cast(other.real()), static_cast(other.imag())) {} + // explicit complex(const complex& other) = default; + value_type real() const noexcept { return real_; } value_type imag() const noexcept { return imag_; } @@ -600,6 +602,12 @@ class complex { static_cast(imag_)); } + operator std::complex() const noexcept + { + return std::complex(static_cast(real_), + static_cast(imag_)); + } + template complex& operator=(const V& val) { @@ -656,13 +664,18 @@ class complex { template complex& operator*=(const complex& val) { - *this = *this * complex(val.real(), val.imag()); + auto tmp = real_; + real_ = real_ * val.real() - imag_ * val.imag(); + imag_ = tmp * val.imag() + imag_ * val.real(); return *this; } template complex& operator/=(const complex& val) { - *this = *this / complex(val.real(), val.imag()); + auto real = val.real(); + auto imag = val.imag(); + (*this) *= complex{val.real(), -val.imag()}; + (*this) /= (real * real + imag * imag); return *this; } @@ -738,6 +751,8 @@ struct numeric_limits { } }; +// complex using a template on operator= for any kind of complex, so we can +// do full specialization for half template <> inline complex& complex::operator=( const std::complex& a) diff --git a/core/test/utils/assertions.hpp 
b/core/test/utils/assertions.hpp index 153907cf2cf..a0bbccbb393 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -669,15 +669,19 @@ ::testing::AssertionResult values_near, std::complex>( std::complex val2, double abs_error) { using T = std::complex; - const double diff = abs(T{val1} - T{val2}); + T Tval1; + T Tval2; + Tval1 = val1; + Tval2 = val2; + const double diff = abs(Tval1 - Tval2); if (diff <= abs_error) return ::testing::AssertionSuccess(); return ::testing::AssertionFailure() << "The difference between " << first_expression << " and " << second_expression << " is " << diff << ", which exceeds " << tolerance_expression << ", where\n" - << first_expression << " evaluates to " << T{val1} << ",\n" - << second_expression << " evaluates to " << T{val2} << ", and\n" + << first_expression << " evaluates to " << Tval1 << ",\n" + << second_expression << " evaluates to " << Tval2 << ", and\n" << tolerance_expression << " evaluates to " << abs_error << "."; } diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 148fdfde2dd..6e8d93bbc9a 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -49,6 +49,43 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +// namespace std { +GKO_ATTRIBUTES GKO_INLINE __half hypot(__half a, __half b) +{ + return hypot(static_cast(a), static_cast(b)); +} + +GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> sqrt( + thrust::complex<__half> a) +{ + return sqrt(static_cast>(a)); +} + +// } // namespace std + +namespace thrust { +template <> +GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) +{ + return hypot(z.real(), z.imag()); +} + +} // namespace thrust + + +#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ + const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ + { \ + auto result = lhs; \ + result _opeq rhs; \ + return result; \ + } + +THRUST_HALF_FRIEND_OPERATOR(+, +=) +THRUST_HALF_FRIEND_OPERATOR(-, -=) +THRUST_HALF_FRIEND_OPERATOR(*, *=) +THRUST_HALF_FRIEND_OPERATOR(/, /=) namespace gko { @@ -76,6 +113,13 @@ __device__ __forceinline__ bool is_nan(const __half& val) #endif +template <> +__device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val) +{ + return is_nan(val.real()) || is_nan(val.imag()); +} + + namespace kernels { namespace cuda { @@ -277,7 +321,7 @@ struct cuda_struct_member_type_impl { template struct cuda_struct_member_type_impl> { - using type = fake_complex; + using type = fake_complex::type>; }; template <> @@ -306,6 +350,7 @@ GKO_CUDA_DATA_TYPE(float, CUDA_R_32F); GKO_CUDA_DATA_TYPE(double, CUDA_R_64F); GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_32F); GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_64F); +GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_16F); GKO_CUDA_DATA_TYPE(int32, CUDA_R_32I); GKO_CUDA_DATA_TYPE(int8, CUDA_R_8I); diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index c83b1ca937f..aad0f9b07e3 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -56,7 +56,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
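An aside on the operators above: there is no native complex arithmetic at 16 bits, so the binary operators promote both sides to single precision, compute there, and narrow the result back. A standalone sketch of that promotion with stand-in types (`my_half` and `my_cplx` are illustrative, not the library's types):

#include <complex>

struct my_half {
    float v;
    my_half(float f = 0.f) : v(f) {}
    operator float() const { return v; }
};

struct my_cplx {
    my_half real;
    my_half imag;
};

inline my_cplx operator*(const my_cplx& lhs, const my_cplx& rhs)
{
    // promote to float complex, multiply, then narrow back to 16-bit storage
    const std::complex<float> l{float(lhs.real), float(lhs.imag)};
    const std::complex<float> r{float(rhs.real), float(rhs.imag)};
    const auto prod = l * r;
    return {my_half(prod.real()), my_half(prod.imag())};
}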
class __half; - +namespace thrust { +template +class complex; +} namespace std { @@ -71,7 +74,8 @@ inline gko::half sqrt(gko::half a) { return gko::half(sqrt(float(a))); } inline std::complex sqrt(std::complex a) { - return std::complex(sqrt(std::complex(a))); + return std::complex(sqrt(std::complex( + static_cast(a.real()), static_cast(a.imag())))); } @@ -207,6 +211,10 @@ template struct is_complex_impl> : public std::integral_constant {}; +template +struct is_complex_impl> + : public std::integral_constant {}; + template struct is_complex_or_scalar_impl : std::is_scalar {}; @@ -221,6 +229,10 @@ template struct is_complex_or_scalar_impl> : is_complex_or_scalar_impl {}; +template +struct is_complex_or_scalar_impl> + : is_complex_or_scalar_impl {}; + /** * template_converter is converting the template parameters of a class by diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 3e83288ff16..8e88828ad29 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -420,9 +420,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_COMPILE_KERNEL -#define GKO_ADAPT_CPHF(_macro) \ - template <> \ - _macro GKO_NOT_IMPLEMENTED +#define GKO_ADAPT_CPHF(_macro) template _macro #else #define GKO_ADAPT_CPHF(_macro) template _macro #endif From 6b9546b569cc861bc94b5bef71692536c125cbcb Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Fri, 13 Jan 2023 21:47:30 -0600 Subject: [PATCH 11/48] does not work for the other executor --- core/base/extended_float.hpp | 4 +++- core/test/utils/assertions.hpp | 8 +++---- cuda/base/types.hpp | 14 ++++++++---- hip/base/types.hip.hpp | 42 +++++++++++++++++++++++++++++++++- 4 files changed, 57 insertions(+), 11 deletions(-) diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index 263d5f1e833..bf79c75b519 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -574,7 +574,9 @@ class complex { const value_type& imag = value_type(0.f)) : real_(real), imag_(imag) {} - template + template ::value && + std::is_scalar::value>> explicit complex(const T& real, const U& imag) : complex(static_cast(real), static_cast(imag)) {} diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index a0bbccbb393..a8767ba5526 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -669,10 +669,10 @@ ::testing::AssertionResult values_near, std::complex>( std::complex val2, double abs_error) { using T = std::complex; - T Tval1; - T Tval2; - Tval1 = val1; - Tval2 = val2; + // T{val1} calls the constructor of complex() -> which gives the + // complex(double/float) ambiguous + T Tval1 = val1; + T Tval2 = val2; const double diff = abs(Tval1 - Tval2); if (diff <= abs_error) return ::testing::AssertionSuccess(); diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 6e8d93bbc9a..a65307016e0 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -49,7 +49,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -// namespace std { + +// thrust calls the c function not the function from std +// Maybe override the function from thrust directlry GKO_ATTRIBUTES GKO_INLINE __half hypot(__half a, __half b) { return hypot(static_cast(a), static_cast(b)); @@ -61,15 +63,18 @@ GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> sqrt( return sqrt(static_cast>(a)); } -// } // namespace std namespace thrust { + + +// Dircetly call float versrion from here? 
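// The explicit specialization below makes thrust::abs on a complex<__half>
// go through the float-precision hypot overload defined above, since a
// native __half hypot is not generally available. A roughly equivalent
// sketch (hypothetical, shown only for illustration) would widen the whole
// value first, mirroring the sqrt overload above:
//
//   GKO_ATTRIBUTES GKO_INLINE __half abs_via_float(const complex<__half>& z)
//   {
//       return __half(abs(static_cast<complex<float>>(z)));
//   }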
template <> GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) { return hypot(z.real(), z.imag()); } + } // namespace thrust @@ -77,9 +82,7 @@ GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ { \ - auto result = lhs; \ - result _opeq rhs; \ - return result; \ + return thrust::complex{lhs} + thrust::complex(rhs); \ } THRUST_HALF_FRIEND_OPERATOR(+, +=) @@ -87,6 +90,7 @@ THRUST_HALF_FRIEND_OPERATOR(-, -=) THRUST_HALF_FRIEND_OPERATOR(*, *=) THRUST_HALF_FRIEND_OPERATOR(/, /=) + namespace gko { diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 23f8fc55fc2..6dbcdf01325 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -54,6 +54,46 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +// thrust calls the c function not the function from std +// Maybe override the function from thrust directlry +GKO_ATTRIBUTES GKO_INLINE __half hypot(__half a, __half b) +{ + return hypot(static_cast(a), static_cast(b)); +} + +GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> sqrt( + thrust::complex<__half> a) +{ + return sqrt(static_cast>(a)); +} + + +namespace thrust { + + +// Dircetly call float versrion from here? +template <> +GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) +{ + return hypot(static_cast(z.real()), static_cast(z.imag())); +} + + +} // namespace thrust + +#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ + const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ + { \ + return thrust::complex{lhs} + thrust::complex(rhs); \ + } + +THRUST_HALF_FRIEND_OPERATOR(+, +=) +THRUST_HALF_FRIEND_OPERATOR(-, -=) +THRUST_HALF_FRIEND_OPERATOR(*, *=) +THRUST_HALF_FRIEND_OPERATOR(/, /=) + + namespace gko { #if defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 700 @@ -327,7 +367,7 @@ struct hip_struct_member_type_impl { template struct hip_struct_member_type_impl> { - using type = fake_complex; + using type = fake_complex::type>; }; template <> From 384371b25442341543d54bf1712e59d8da3a4139 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 6 Feb 2023 22:41:38 +0100 Subject: [PATCH 12/48] fix complex issue and sqrt issue sqrt need to be global namespace to live with hip sqrt --- core/base/extended_float.hpp | 22 +++++---- core/preconditioner/jacobi.cpp | 4 +- hip/base/types.hip.hpp | 46 ++++++++++--------- .../jacobi_generate_instantiate.inc.hip.cpp | 24 +++++----- include/ginkgo/core/base/half.hpp | 24 ++++++++++ reference/matrix/ell_kernels.cpp | 3 +- reference/solver/idr_kernels.cpp | 12 +++-- test/solver/solver.cpp | 2 + 8 files changed, 86 insertions(+), 51 deletions(-) create mode 100644 include/ginkgo/core/base/half.hpp diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index bf79c75b519..6898c4ffaa2 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -54,6 +54,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#else +class __half; #endif // __CUDA_ARCH__ @@ -101,7 +103,7 @@ struct basic_float_traits { static constexpr bool rounds_to_nearest = true; }; -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +// #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) template <> struct basic_float_traits<__half> { using type = __half; @@ -110,7 +112,7 @@ struct basic_float_traits<__half> { static constexpr int exponent_bits = 5; static constexpr bool rounds_to_nearest = true; }; -#endif +// #endif template <> struct basic_float_traits { @@ -598,17 +600,17 @@ class complex { value_type imag() const noexcept { return imag_; } - operator std::complex() const noexcept + operator std::complex() const noexcept { - return std::complex(static_cast(real_), - static_cast(imag_)); + return std::complex(static_cast(real_), + static_cast(imag_)); } - operator std::complex() const noexcept - { - return std::complex(static_cast(real_), - static_cast(imag_)); - } + // operator std::complex() const noexcept + // { + // return std::complex(static_cast(real_), + // static_cast(imag_)); + // } template complex& operator=(const V& val) diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index 5f8d194fcd6..75f5e941303 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -319,7 +319,9 @@ void Jacobi::generate(const LinOp* system_matrix, ->extract_diagonal_linop()); auto diag_vt = ::gko::detail::temporary_conversion>:: - template create>>( + template create>, + matrix::Diagonal>>>( diag.get()); if (!diag_vt) { GKO_NOT_SUPPORTED(system_matrix); diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 6dbcdf01325..79ecaa9c93f 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -67,6 +67,28 @@ GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> sqrt( return sqrt(static_cast>(a)); } +// __device__ __forceinline__ float sqrt(float val) { return sqrtf(val); } +// __device__ __forceinline__ double sqrt(double val) { return ::sqrt(val); } +__device__ __forceinline__ thrust::complex sqrt( + thrust::complex val) +{ + return thrust::sqrt(val); +} +__device__ __forceinline__ thrust::complex sqrt( + thrust::complex val) +{ + return thrust::sqrt(val); +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +__device__ __forceinline__ __half sqrt(__half val) +{ + return sqrt(static_cast(val)); +} +#else +__device__ __forceinline__ __half sqrt(__half val) { return hsqrt(val); } +#endif + namespace thrust { @@ -126,29 +148,9 @@ __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } #endif -#if defined(__HIPCC__) -__device__ __forceinline__ float sqrt(float val) { return sqrtf(val); } -__device__ __forceinline__ double sqrt(double val) { return sqrt(val); } -__device__ __forceinline__ thrust::complex sqrt( - thrust::complex val) -{ - return thrust::sqrt(val); -} -__device__ __forceinline__ thrust::complex sqrt( - thrust::complex val) -{ - return thrust::sqrt(val); -} +// #if defined(__HIPCC__) -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 -__device__ __forceinline__ __half sqrt(__half val) -{ - return sqrt(static_cast(val)); -} -#else -__device__ __forceinline__ __half sqrt(__half val) { return hsqrt(val); } -#endif -#endif +// #endif namespace kernels { diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp index d68dc5797db..fa970818622 100644 --- a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp +++ 
b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp @@ -53,18 +53,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { -namespace detail { -#if !defined(__HIP_DEVICE_COMPILE__) -template <> -struct basic_float_traits<__half> { - using type = __half; - static constexpr int sign_bits = 1; - static constexpr int significand_bits = 10; - static constexpr int exponent_bits = 5; - static constexpr bool rounds_to_nearest = true; -}; -#endif -} // namespace detail +// namespace detail { +// #if !defined(__HIP_DEVICE_COMPILE__) +// template <> +// struct basic_float_traits<__half> { +// using type = __half; +// static constexpr int sign_bits = 1; +// static constexpr int significand_bits = 10; +// static constexpr int exponent_bits = 5; +// static constexpr bool rounds_to_nearest = true; +// }; +// #endif +// } // namespace detail namespace kernels { namespace hip { /** diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp new file mode 100644 index 00000000000..7ef38f68247 --- /dev/null +++ b/include/ginkgo/core/base/half.hpp @@ -0,0 +1,24 @@ +#ifndef GKO_BASE_HALF_HPP_ +#define GKO_BASE_HALF_HPP_ +#include +#include + + +#ifdef __CUDA_ARCH__ + + +#include + + +#elif defined(__HIP_DEVICE_COMPILE__) + + +#include + + +#endif // __CUDA_ARCH__ + + +namespace gko {} + +#endif // GKO_BASE_HALF_HPP_ diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index 6a78490af9c..b4de6c418f4 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -137,7 +137,8 @@ void advanced_spmv(std::shared_ptr exec, for (size_type j = 0; j < c->get_size()[1]; j++) { for (size_type row = 0; row < a->get_size()[0]; row++) { - arithmetic_type result = c->at(row, j); + arithmetic_type result = + static_cast(c->at(row, j)); result *= beta_val; for (size_type i = 0; i < num_stored_elements_per_row; i++) { arithmetic_type val = a_vals(row + i * stride); diff --git a/reference/solver/idr_kernels.cpp b/reference/solver/idr_kernels.cpp index 79ca67866bb..f9604a55313 100644 --- a/reference/solver/idr_kernels.cpp +++ b/reference/solver/idr_kernels.cpp @@ -152,15 +152,17 @@ void initialize(std::shared_ptr exec, // Initialize and Orthonormalize P const auto num_rows = subspace_vectors->get_size()[0]; const auto num_cols = subspace_vectors->get_size()[1]; - // auto dist = std::normal_distribution>(0.0, 1.0); + auto dist = std::normal_distribution< + typename ::gko::detail::arth_type>::type>( + 0.0, 1.0); auto seed = std::random_device{}(); auto gen = std::default_random_engine(seed); for (size_type row = 0; row < num_rows; row++) { if (!deterministic) { - // for (size_type col = 0; col < num_cols; col++) { - // subspace_vectors->at(row, col) = - // // get_rand_value(dist, gen); - // } + for (size_type col = 0; col < num_cols; col++) { + subspace_vectors->at(row, col) = + get_rand_value(dist, gen); + } } for (size_type i = 0; i < row; i++) { diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp index 30bddf11535..a6adb0394d3 100644 --- a/test/solver/solver.cpp +++ b/test/solver/solver.cpp @@ -995,6 +995,7 @@ TYPED_TEST(Solver, MixedApplyIsEquivalentToRef) solver.ref->apply(b.ref, x.ref); solver.dev->apply(b.dev, x.dev); + // TODO: in double with half, 4 iterations leads inf GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->mixed_tol(x)); }); }); @@ -1013,6 +1014,7 @@ TYPED_TEST(Solver, MixedAdvancedApplyIsEquivalentToRef) solver.ref->apply(alpha.ref, b.ref, beta.ref, x.ref); solver.dev->apply(alpha.dev, b.dev, 
beta.dev, x.dev); + // TODO: in double with half, 4 iterations leads inf GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->mixed_tol(x)); }); }); From c04d7d02f59740f792cf51cc65601331b1f8deea Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 6 Feb 2023 23:18:22 +0100 Subject: [PATCH 13/48] try fix the compilation issue from MSVC and MacOS It seems to use complex version even using half only --- core/base/extended_float.hpp | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index 6898c4ffaa2..4dcfb481b16 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -683,6 +683,23 @@ class complex { return *this; } +// It's for MacOS. +// TODO: check whether mac compiler always use complex version even when real +// half +#define COMPLEX_HALF_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES friend complex operator _op( \ + const complex lhf, const complex rhf) \ + { \ + auto a = lhf; \ + a _opeq rhf; \ + return a; \ + } + + COMPLEX_HALF_OPERATOR(+, +=) + COMPLEX_HALF_OPERATOR(-, -=) + COMPLEX_HALF_OPERATOR(*, *=) + COMPLEX_HALF_OPERATOR(/, /=) + private: value_type real_; value_type imag_; @@ -766,6 +783,17 @@ inline complex& complex::operator=( return *this; } +// For MSVC +template <> +inline complex& complex::operator=( + const std::complex& a) +{ + complex t(a.real(), a.imag()); + operator=(t); + return *this; +} + + } // namespace std From efb9aea768a2e31d53753d0b18f160dcd5a482de Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 7 Feb 2023 13:08:01 +0100 Subject: [PATCH 14/48] move the half to public and use sycl::half for dpcpp --- core/base/extended_float.hpp | 608 -------------------------- include/ginkgo/core/base/half.hpp | 679 ++++++++++++++++++++++++++++- include/ginkgo/core/base/types.hpp | 10 +- 3 files changed, 683 insertions(+), 614 deletions(-) diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index 4dcfb481b16..2ed7c8b2626 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -62,418 +62,6 @@ class __half; namespace gko { -template -class truncated; - - -namespace detail { - - -template -struct uint_of_impl {}; - -template -struct uint_of_impl> { - using type = uint16; -}; - -template -struct uint_of_impl> { - using type = uint32; -}; - -template -struct uint_of_impl> { - using type = uint64; -}; - -template -using uint_of = typename uint_of_impl::type; - - -template -struct basic_float_traits {}; - -template <> -struct basic_float_traits { - using type = float16; - static constexpr int sign_bits = 1; - static constexpr int significand_bits = 10; - static constexpr int exponent_bits = 5; - static constexpr bool rounds_to_nearest = true; -}; - -// #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) -template <> -struct basic_float_traits<__half> { - using type = __half; - static constexpr int sign_bits = 1; - static constexpr int significand_bits = 10; - static constexpr int exponent_bits = 5; - static constexpr bool rounds_to_nearest = true; -}; -// #endif - -template <> -struct basic_float_traits { - using type = float32; - static constexpr int sign_bits = 1; - static constexpr int significand_bits = 23; - static constexpr int exponent_bits = 8; - static constexpr bool rounds_to_nearest = true; -}; - -template <> -struct basic_float_traits { - using type = float64; - static constexpr int sign_bits = 1; - static constexpr int significand_bits = 52; - static constexpr int exponent_bits = 11; - static 
constexpr bool rounds_to_nearest = true; -}; - -template -struct basic_float_traits> { - using type = truncated; - static constexpr int sign_bits = ComponentId == 0 ? 1 : 0; - static constexpr int exponent_bits = - ComponentId == 0 ? basic_float_traits::exponent_bits : 0; - static constexpr int significand_bits = - ComponentId == 0 ? sizeof(type) * byte_size - exponent_bits - 1 - : sizeof(type) * byte_size; - static constexpr bool rounds_to_nearest = false; -}; - - -template -constexpr UintType create_ones(int n) -{ - return (n == sizeof(UintType) * byte_size ? static_cast(0) - : static_cast(1) << n) - - static_cast(1); -} - -template -struct float_traits { - using type = typename basic_float_traits::type; - using bits_type = uint_of; - static constexpr int sign_bits = basic_float_traits::sign_bits; - static constexpr int significand_bits = - basic_float_traits::significand_bits; - static constexpr int exponent_bits = basic_float_traits::exponent_bits; - static constexpr bits_type significand_mask = - create_ones(significand_bits); - static constexpr bits_type exponent_mask = - create_ones(significand_bits + exponent_bits) - - significand_mask; - static constexpr bits_type bias_mask = - create_ones(significand_bits + exponent_bits - 1) - - significand_mask; - static constexpr bits_type sign_mask = - create_ones(sign_bits + significand_bits + exponent_bits) - - exponent_mask - significand_mask; - static constexpr bool rounds_to_nearest = - basic_float_traits::rounds_to_nearest; - - static constexpr auto eps = - 1.0 / (1ll << (significand_bits + rounds_to_nearest)); - - static constexpr bool is_inf(bits_type data) - { - return (data & exponent_mask) == exponent_mask && - (data & significand_mask) == bits_type{}; - } - - static constexpr bool is_nan(bits_type data) - { - return (data & exponent_mask) == exponent_mask && - (data & significand_mask) != bits_type{}; - } - - static constexpr bool is_denom(bits_type data) - { - return (data & exponent_mask) == bits_type{}; - } -}; - - -template -struct precision_converter; - -// upcasting implementation details -template -struct precision_converter { - using source_traits = float_traits; - using result_traits = float_traits; - using source_bits = typename source_traits::bits_type; - using result_bits = typename result_traits::bits_type; - - static_assert(source_traits::exponent_bits <= - result_traits::exponent_bits && - source_traits::significand_bits <= - result_traits::significand_bits, - "SourceType has to have both lower range and precision or " - "higher range and precision than ResultType"); - - static constexpr int significand_offset = - result_traits::significand_bits - source_traits::significand_bits; - static constexpr int exponent_offset = significand_offset; - static constexpr int sign_offset = result_traits::exponent_bits - - source_traits::exponent_bits + - exponent_offset; - static constexpr result_bits bias_change = - result_traits::bias_mask - - (static_cast(source_traits::bias_mask) << exponent_offset); - - static constexpr result_bits shift_significand(source_bits data) noexcept - { - return static_cast(data & source_traits::significand_mask) - << significand_offset; - } - - static constexpr result_bits shift_exponent(source_bits data) noexcept - { - return update_bias( - static_cast(data & source_traits::exponent_mask) - << exponent_offset); - } - - static constexpr result_bits shift_sign(source_bits data) noexcept - { - return static_cast(data & source_traits::sign_mask) - << sign_offset; - } - -private: - static constexpr 
result_bits update_bias(result_bits data) noexcept - { - return data == typename result_traits::bits_type{} ? data - : data + bias_change; - } -}; - -// downcasting implementation details -template -struct precision_converter { - using source_traits = float_traits; - using result_traits = float_traits; - using source_bits = typename source_traits::bits_type; - using result_bits = typename result_traits::bits_type; - - static_assert(source_traits::exponent_bits >= - result_traits::exponent_bits && - source_traits::significand_bits >= - result_traits::significand_bits, - "SourceType has to have both lower range and precision or " - "higher range and precision than ResultType"); - - static constexpr int significand_offset = - source_traits::significand_bits - result_traits::significand_bits; - static constexpr int exponent_offset = significand_offset; - static constexpr int sign_offset = source_traits::exponent_bits - - result_traits::exponent_bits + - exponent_offset; - static constexpr source_bits bias_change = - (source_traits::bias_mask >> exponent_offset) - - static_cast(result_traits::bias_mask); - - static constexpr result_bits shift_significand(source_bits data) noexcept - { - return static_cast( - (data & source_traits::significand_mask) >> significand_offset); - } - - static constexpr result_bits shift_exponent(source_bits data) noexcept - { - return static_cast(update_bias( - (data & source_traits::exponent_mask) >> exponent_offset)); - } - - static constexpr result_bits shift_sign(source_bits data) noexcept - { - return static_cast((data & source_traits::sign_mask) >> - sign_offset); - } - -private: - static constexpr source_bits update_bias(source_bits data) noexcept - { - return data <= bias_change ? typename source_traits::bits_type{} - : limit_exponent(data - bias_change); - } - - static constexpr source_bits limit_exponent(source_bits data) noexcept - { - return data >= static_cast(result_traits::exponent_mask) - ? static_cast(result_traits::exponent_mask) - : data; - } -}; - - -} // namespace detail - - -/** - * A class providing basic support for half precision floating point types. - * - * For now the only features are reduced storage compared to single precision - * and conversions from and to single precision floating point type. - */ -class half { -public: - GKO_ATTRIBUTES half() noexcept = default; - - template ::value>> - GKO_ATTRIBUTES half(const T val) - { - this->float2half(static_cast(val)); - } - - GKO_ATTRIBUTES half(const half& val) = default; - - template - GKO_ATTRIBUTES half& operator=(const V val) - { - this->float2half(static_cast(val)); - return *this; - } - - GKO_ATTRIBUTES operator float() const noexcept - { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - return __half2float(reinterpret_cast(data_)); -#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - const auto bits = half2float(data_); - return reinterpret_cast(bits); -#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - } - - // can not use half operator _op(const half) for half + half - // operation will cast it to float and then do float operation such that it - // becomes float in the end. 
-#define HALF_OPERATOR(_op, _opeq) \ - GKO_ATTRIBUTES friend half operator _op(const half lhf, const half rhf) \ - { \ - return static_cast(static_cast(lhf) \ - _op static_cast(rhf)); \ - } \ - GKO_ATTRIBUTES half& operator _opeq(const half& hf) \ - { \ - auto result = *this _op hf; \ - this->float2half(result); \ - return *this; \ - } - HALF_OPERATOR(+, +=) - HALF_OPERATOR(-, -=) - HALF_OPERATOR(*, *=) - HALF_OPERATOR(/, /=) - - // Do operation with different type - // If it is floating point, using floating point as type. - // If it is integer, using half as type -#define HALF_FRIEND_OPERATOR(_op, _opeq) \ - template \ - GKO_ATTRIBUTES friend std::enable_if_t< \ - !std::is_same::value && std::is_scalar::value, \ - typename std::conditional::value, T, \ - half>::type> \ - operator _op(const half hf, const T val) \ - { \ - using type = \ - typename std::conditional::value, T, \ - half>::type; \ - auto result = static_cast(hf); \ - result _opeq static_cast(val); \ - return result; \ - } \ - template \ - GKO_ATTRIBUTES friend std::enable_if_t< \ - !std::is_same::value && std::is_scalar::value, \ - typename std::conditional::value, T, \ - half>::type> \ - operator _op(const T val, const half hf) \ - { \ - using type = \ - typename std::conditional::value, T, \ - half>::type; \ - auto result = static_cast(val); \ - result _opeq static_cast(hf); \ - return result; \ - } - - HALF_FRIEND_OPERATOR(+, +=) - HALF_FRIEND_OPERATOR(-, -=) - HALF_FRIEND_OPERATOR(*, *=) - HALF_FRIEND_OPERATOR(/, /=) - - // the negative - GKO_ATTRIBUTES half operator-() const - { - auto val = 0.0f - *this; - return half(val); - } - -private: - using f16_traits = detail::float_traits; - using f32_traits = detail::float_traits; - - // TODO: do we really need this one? - // Without it, everything can be constexpr, which might make stuff easier. 
- GKO_ATTRIBUTES void float2half(float val) noexcept - { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - const auto tmp = __float2half_rn(val); - data_ = reinterpret_cast(tmp); -#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - data_ = float2half(reinterpret_cast(val)); -#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - } - - static GKO_ATTRIBUTES uint16 float2half(uint32 data_) noexcept - { - using conv = detail::precision_converter; - if (f32_traits::is_inf(data_)) { - return conv::shift_sign(data_) | f16_traits::exponent_mask; - } else if (f32_traits::is_nan(data_)) { - return conv::shift_sign(data_) | f16_traits::exponent_mask | - f16_traits::significand_mask; - } else { - const auto exp = conv::shift_exponent(data_); - if (f16_traits::is_inf(exp)) { - return conv::shift_sign(data_) | exp; - } else if (f16_traits::is_denom(exp)) { - // TODO: handle denormals - return conv::shift_sign(data_); - } else { - return conv::shift_sign(data_) | exp | - conv::shift_significand(data_); - } - } - } - - static GKO_ATTRIBUTES uint32 half2float(uint16 data_) noexcept - { - using conv = detail::precision_converter; - if (f16_traits::is_inf(data_)) { - return conv::shift_sign(data_) | f32_traits::exponent_mask; - } else if (f16_traits::is_nan(data_)) { - return conv::shift_sign(data_) | f32_traits::exponent_mask | - f32_traits::significand_mask; - } else if (f16_traits::is_denom(data_)) { - // TODO: handle denormals - return conv::shift_sign(data_); - } else { - return conv::shift_sign(data_) | conv::shift_exponent(data_) | - conv::shift_significand(data_); - } - } - - uint16 data_; -}; - - /** * This template implements the truncated (or split) storage of a floating point * type. @@ -567,145 +155,6 @@ class truncated { namespace std { -template <> -class complex { -public: - using value_type = gko::half; - - complex(const value_type& real = value_type(0.f), - const value_type& imag = value_type(0.f)) - : real_(real), imag_(imag) - {} - template ::value && - std::is_scalar::value>> - explicit complex(const T& real, const U& imag) - : complex(static_cast(real), static_cast(imag)) - {} - - template ::value>> - complex(const T& real) : complex(static_cast(real)) - {} - - template ::value>> - explicit complex(const complex& other) - : complex(static_cast(other.real()), - static_cast(other.imag())) - {} - - // explicit complex(const complex& other) = default; - - value_type real() const noexcept { return real_; } - - value_type imag() const noexcept { return imag_; } - - - operator std::complex() const noexcept - { - return std::complex(static_cast(real_), - static_cast(imag_)); - } - - // operator std::complex() const noexcept - // { - // return std::complex(static_cast(real_), - // static_cast(imag_)); - // } - - template - complex& operator=(const V& val) - { - real_ = val; - imag_ = value_type(); - return *this; - } - - template - complex& operator=(const std::complex& val) - { - real_ = val.real(); - imag_ = val.imag(); - return *this; - } - - complex& operator+=(const value_type& real) - { - real_ += real; - return *this; - } - complex& operator-=(const value_type& real) - { - real_ -= real; - return *this; - } - complex& operator*=(const value_type& real) - { - real_ *= real; - imag_ *= real; - return *this; - } - complex& operator/=(const value_type& real) - { - real_ /= real; - imag_ /= real; - return *this; - } - - template - complex& operator+=(const complex& val) - { - real_ += val.real(); - imag_ += val.imag(); - return *this; - } - 
template - complex& operator-=(const complex& val) - { - real_ -= val.real(); - imag_ -= val.imag(); - return *this; - } - template - complex& operator*=(const complex& val) - { - auto tmp = real_; - real_ = real_ * val.real() - imag_ * val.imag(); - imag_ = tmp * val.imag() + imag_ * val.real(); - return *this; - } - template - complex& operator/=(const complex& val) - { - auto real = val.real(); - auto imag = val.imag(); - (*this) *= complex{val.real(), -val.imag()}; - (*this) /= (real * real + imag * imag); - return *this; - } - -// It's for MacOS. -// TODO: check whether mac compiler always use complex version even when real -// half -#define COMPLEX_HALF_OPERATOR(_op, _opeq) \ - GKO_ATTRIBUTES friend complex operator _op( \ - const complex lhf, const complex rhf) \ - { \ - auto a = lhf; \ - a _opeq rhf; \ - return a; \ - } - - COMPLEX_HALF_OPERATOR(+, +=) - COMPLEX_HALF_OPERATOR(-, -=) - COMPLEX_HALF_OPERATOR(*, *=) - COMPLEX_HALF_OPERATOR(/, /=) - -private: - value_type real_; - value_type imag_; -}; - - template class complex> { public: @@ -737,63 +186,6 @@ class complex> { }; -template <> -struct numeric_limits { - static constexpr bool is_specialized{true}; - static constexpr bool is_signed{true}; - static constexpr bool is_integer{false}; - static constexpr bool is_exact{false}; - static constexpr bool is_bounded{true}; - static constexpr bool is_modulo{false}; - static constexpr int digits{ - gko::detail::float_traits::significand_bits + 1}; - // 3/10 is approx. log_10(2) - static constexpr int digits10{digits * 3 / 10}; - - // Note: gko::half can't return gko::half here because it does not have - // a constexpr constructor. - static constexpr float epsilon() - { - return gko::detail::float_traits::eps; - } - - static constexpr float infinity() - { - return numeric_limits::infinity(); - } - - static constexpr float min() { return numeric_limits::min(); } - - static constexpr float max() { return numeric_limits::max(); } - - static constexpr float quiet_NaN() - { - return numeric_limits::quiet_NaN(); - } -}; - -// complex using a template on operator= for any kind of complex, so we can -// do full specialization for half -template <> -inline complex& complex::operator=( - const std::complex& a) -{ - complex t(a.real(), a.imag()); - operator=(t); - return *this; -} - -// For MSVC -template <> -inline complex& complex::operator=( - const std::complex& a) -{ - complex t(a.real(), a.imag()); - operator=(t); - return *this; -} - - } // namespace std diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 7ef38f68247..09b3c7a0686 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -1,9 +1,50 @@ -#ifndef GKO_BASE_HALF_HPP_ -#define GKO_BASE_HALF_HPP_ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_HALF_HPP_ +#define GKO_PUBLIC_CORE_BASE_HALF_HPP_ + + #include #include +#include +#include + +#ifdef SYCL_LANGUAGE_VERSION +#include +#endif + #ifdef __CUDA_ARCH__ @@ -16,9 +57,639 @@ #include +#else + + +class __half; + + #endif // __CUDA_ARCH__ -namespace gko {} +namespace gko { + + +template +class truncated; + + +namespace detail { + + +template +struct uint_of_impl {}; + +template +struct uint_of_impl> { + using type = uint16; +}; + +template +struct uint_of_impl> { + using type = uint32; +}; + +template +struct uint_of_impl> { + using type = uint64; +}; + +template +using uint_of = typename uint_of_impl::type; + + +template +struct basic_float_traits {}; + +template <> +struct basic_float_traits { + using type = float16; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 10; + static constexpr int exponent_bits = 5; + static constexpr bool rounds_to_nearest = true; +}; + +// #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +template <> +struct basic_float_traits<__half> { + using type = __half; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 10; + static constexpr int exponent_bits = 5; + static constexpr bool rounds_to_nearest = true; +}; +// #endif + +template <> +struct basic_float_traits { + using type = float32; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 23; + static constexpr int exponent_bits = 8; + static constexpr bool rounds_to_nearest = true; +}; + +template <> +struct basic_float_traits { + using type = float64; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 52; + static constexpr int exponent_bits = 11; + static constexpr bool rounds_to_nearest = true; +}; + +template +struct basic_float_traits> { + using type = truncated; + static constexpr int sign_bits = ComponentId == 0 ? 1 : 0; + static constexpr int exponent_bits = + ComponentId == 0 ? basic_float_traits::exponent_bits : 0; + static constexpr int significand_bits = + ComponentId == 0 ? sizeof(type) * byte_size - exponent_bits - 1 + : sizeof(type) * byte_size; + static constexpr bool rounds_to_nearest = false; +}; + + +template +constexpr UintType create_ones(int n) +{ + return (n == sizeof(UintType) * byte_size ? 
static_cast(0) + : static_cast(1) << n) - + static_cast(1); +} + +template +struct float_traits { + using type = typename basic_float_traits::type; + using bits_type = uint_of; + static constexpr int sign_bits = basic_float_traits::sign_bits; + static constexpr int significand_bits = + basic_float_traits::significand_bits; + static constexpr int exponent_bits = basic_float_traits::exponent_bits; + static constexpr bits_type significand_mask = + create_ones(significand_bits); + static constexpr bits_type exponent_mask = + create_ones(significand_bits + exponent_bits) - + significand_mask; + static constexpr bits_type bias_mask = + create_ones(significand_bits + exponent_bits - 1) - + significand_mask; + static constexpr bits_type sign_mask = + create_ones(sign_bits + significand_bits + exponent_bits) - + exponent_mask - significand_mask; + static constexpr bool rounds_to_nearest = + basic_float_traits::rounds_to_nearest; + + static constexpr auto eps = + 1.0 / (1ll << (significand_bits + rounds_to_nearest)); + + static constexpr bool is_inf(bits_type data) + { + return (data & exponent_mask) == exponent_mask && + (data & significand_mask) == bits_type{}; + } + + static constexpr bool is_nan(bits_type data) + { + return (data & exponent_mask) == exponent_mask && + (data & significand_mask) != bits_type{}; + } + + static constexpr bool is_denom(bits_type data) + { + return (data & exponent_mask) == bits_type{}; + } +}; + + +template +struct precision_converter; + +// upcasting implementation details +template +struct precision_converter { + using source_traits = float_traits; + using result_traits = float_traits; + using source_bits = typename source_traits::bits_type; + using result_bits = typename result_traits::bits_type; + + static_assert(source_traits::exponent_bits <= + result_traits::exponent_bits && + source_traits::significand_bits <= + result_traits::significand_bits, + "SourceType has to have both lower range and precision or " + "higher range and precision than ResultType"); + + static constexpr int significand_offset = + result_traits::significand_bits - source_traits::significand_bits; + static constexpr int exponent_offset = significand_offset; + static constexpr int sign_offset = result_traits::exponent_bits - + source_traits::exponent_bits + + exponent_offset; + static constexpr result_bits bias_change = + result_traits::bias_mask - + (static_cast(source_traits::bias_mask) << exponent_offset); + + static constexpr result_bits shift_significand(source_bits data) noexcept + { + return static_cast(data & source_traits::significand_mask) + << significand_offset; + } + + static constexpr result_bits shift_exponent(source_bits data) noexcept + { + return update_bias( + static_cast(data & source_traits::exponent_mask) + << exponent_offset); + } + + static constexpr result_bits shift_sign(source_bits data) noexcept + { + return static_cast(data & source_traits::sign_mask) + << sign_offset; + } + +private: + static constexpr result_bits update_bias(result_bits data) noexcept + { + return data == typename result_traits::bits_type{} ? 
data + : data + bias_change; + } +}; + +// downcasting implementation details +template +struct precision_converter { + using source_traits = float_traits; + using result_traits = float_traits; + using source_bits = typename source_traits::bits_type; + using result_bits = typename result_traits::bits_type; + + static_assert(source_traits::exponent_bits >= + result_traits::exponent_bits && + source_traits::significand_bits >= + result_traits::significand_bits, + "SourceType has to have both lower range and precision or " + "higher range and precision than ResultType"); + + static constexpr int significand_offset = + source_traits::significand_bits - result_traits::significand_bits; + static constexpr int exponent_offset = significand_offset; + static constexpr int sign_offset = source_traits::exponent_bits - + result_traits::exponent_bits + + exponent_offset; + static constexpr source_bits bias_change = + (source_traits::bias_mask >> exponent_offset) - + static_cast(result_traits::bias_mask); + + static constexpr result_bits shift_significand(source_bits data) noexcept + { + return static_cast( + (data & source_traits::significand_mask) >> significand_offset); + } + + static constexpr result_bits shift_exponent(source_bits data) noexcept + { + return static_cast(update_bias( + (data & source_traits::exponent_mask) >> exponent_offset)); + } + + static constexpr result_bits shift_sign(source_bits data) noexcept + { + return static_cast((data & source_traits::sign_mask) >> + sign_offset); + } + +private: + static constexpr source_bits update_bias(source_bits data) noexcept + { + return data <= bias_change ? typename source_traits::bits_type{} + : limit_exponent(data - bias_change); + } + + static constexpr source_bits limit_exponent(source_bits data) noexcept + { + return data >= static_cast(result_traits::exponent_mask) + ? static_cast(result_traits::exponent_mask) + : data; + } +}; + + +} // namespace detail + +#ifdef SYCL_LANGUAGE_VERSION +using half = sycl::half; +#else +/** + * A class providing basic support for half precision floating point types. + * + * For now the only features are reduced storage compared to single precision + * and conversions from and to single precision floating point type. + */ +class half { +public: + GKO_ATTRIBUTES half() noexcept = default; + + template ::value>> + GKO_ATTRIBUTES half(const T val) + { + this->float2half(static_cast(val)); + } + + GKO_ATTRIBUTES half(const half& val) = default; + + template + GKO_ATTRIBUTES half& operator=(const V val) + { + this->float2half(static_cast(val)); + return *this; + } + + GKO_ATTRIBUTES operator float() const noexcept + { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __half2float(reinterpret_cast(data_)); +#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + const auto bits = half2float(data_); + return reinterpret_cast(bits); +#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + } + + // can not use half operator _op(const half) for half + half + // operation will cast it to float and then do float operation such that it + // becomes float in the end. 
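    // In other words: because half defines operator float(), a plain
    // `a + b` on two half values would otherwise be evaluated through the
    // built-in float addition and yield a float. The friend operators the
    // macro below generates keep the result in half, e.g. (illustrative
    // sketch only):
    //
    //   gko::half a = 1.5f, b = 2.5f;
    //   auto c = a + b;  // c is gko::half(4.0f), not float
    //   a += b;          // compound form produced by the same macro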
+#define HALF_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES friend half operator _op(const half lhf, const half rhf) \ + { \ + return static_cast(static_cast(lhf) \ + _op static_cast(rhf)); \ + } \ + GKO_ATTRIBUTES half& operator _opeq(const half& hf) \ + { \ + auto result = *this _op hf; \ + this->float2half(result); \ + return *this; \ + } + HALF_OPERATOR(+, +=) + HALF_OPERATOR(-, -=) + HALF_OPERATOR(*, *=) + HALF_OPERATOR(/, /=) + + // Do operation with different type + // If it is floating point, using floating point as type. + // If it is integer, using half as type +#define HALF_FRIEND_OPERATOR(_op, _opeq) \ + template \ + GKO_ATTRIBUTES friend std::enable_if_t< \ + !std::is_same::value && std::is_scalar::value, \ + typename std::conditional::value, T, \ + half>::type> \ + operator _op(const half hf, const T val) \ + { \ + using type = \ + typename std::conditional::value, T, \ + half>::type; \ + auto result = static_cast(hf); \ + result _opeq static_cast(val); \ + return result; \ + } \ + template \ + GKO_ATTRIBUTES friend std::enable_if_t< \ + !std::is_same::value && std::is_scalar::value, \ + typename std::conditional::value, T, \ + half>::type> \ + operator _op(const T val, const half hf) \ + { \ + using type = \ + typename std::conditional::value, T, \ + half>::type; \ + auto result = static_cast(val); \ + result _opeq static_cast(hf); \ + return result; \ + } + + HALF_FRIEND_OPERATOR(+, +=) + HALF_FRIEND_OPERATOR(-, -=) + HALF_FRIEND_OPERATOR(*, *=) + HALF_FRIEND_OPERATOR(/, /=) + + // the negative + GKO_ATTRIBUTES half operator-() const + { + auto val = 0.0f - *this; + return half(val); + } + +private: + using f16_traits = detail::float_traits; + using f32_traits = detail::float_traits; + + // TODO: do we really need this one? + // Without it, everything can be constexpr, which might make stuff easier. 
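    // On the device side this delegates to __float2half_rn; on the host it
    // uses the bit-level converter below, which shifts the sign, rebiased
    // exponent and truncated significand into the 16-bit layout and
    // special-cases inf/NaN (denormal results currently flush to signed
    // zero, see the TODO in float2half(uint32)).
    // Worked example for the host path: 1.0f has the bit pattern 0x3F800000
    // (sign 0, exponent 127, significand 0); rebiasing 127 -> 15 and dropping
    // the 13 low significand bits yields 0x3C00, i.e. half(1.0).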
+ GKO_ATTRIBUTES void float2half(float val) noexcept + { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + const auto tmp = __float2half_rn(val); + data_ = reinterpret_cast(tmp); +#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + data_ = float2half(reinterpret_cast(val)); +#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + } + + static GKO_ATTRIBUTES uint16 float2half(uint32 data_) noexcept + { + using conv = detail::precision_converter; + if (f32_traits::is_inf(data_)) { + return conv::shift_sign(data_) | f16_traits::exponent_mask; + } else if (f32_traits::is_nan(data_)) { + return conv::shift_sign(data_) | f16_traits::exponent_mask | + f16_traits::significand_mask; + } else { + const auto exp = conv::shift_exponent(data_); + if (f16_traits::is_inf(exp)) { + return conv::shift_sign(data_) | exp; + } else if (f16_traits::is_denom(exp)) { + // TODO: handle denormals + return conv::shift_sign(data_); + } else { + return conv::shift_sign(data_) | exp | + conv::shift_significand(data_); + } + } + } + + static GKO_ATTRIBUTES uint32 half2float(uint16 data_) noexcept + { + using conv = detail::precision_converter; + if (f16_traits::is_inf(data_)) { + return conv::shift_sign(data_) | f32_traits::exponent_mask; + } else if (f16_traits::is_nan(data_)) { + return conv::shift_sign(data_) | f32_traits::exponent_mask | + f32_traits::significand_mask; + } else if (f16_traits::is_denom(data_)) { + // TODO: handle denormals + return conv::shift_sign(data_); + } else { + return conv::shift_sign(data_) | conv::shift_exponent(data_) | + conv::shift_significand(data_); + } + } + + uint16 data_; +}; +#endif + + +} // namespace gko + + +namespace std { + + +template <> +class complex { +public: + using value_type = gko::half; + + complex(const value_type& real = value_type(0.f), + const value_type& imag = value_type(0.f)) + : real_(real), imag_(imag) + {} + template ::value && + std::is_scalar::value>> + explicit complex(const T& real, const U& imag) + : complex(static_cast(real), static_cast(imag)) + {} + + template ::value>> + complex(const T& real) : complex(static_cast(real)) + {} + + template ::value>> + explicit complex(const complex& other) + : complex(static_cast(other.real()), + static_cast(other.imag())) + {} + + // explicit complex(const complex& other) = default; + + value_type real() const noexcept { return real_; } + + value_type imag() const noexcept { return imag_; } + + + operator std::complex() const noexcept + { + return std::complex(static_cast(real_), + static_cast(imag_)); + } + + // operator std::complex() const noexcept + // { + // return std::complex(static_cast(real_), + // static_cast(imag_)); + // } + + template + complex& operator=(const V& val) + { + real_ = val; + imag_ = value_type(); + return *this; + } + + template + complex& operator=(const std::complex& val) + { + real_ = val.real(); + imag_ = val.imag(); + return *this; + } + + complex& operator+=(const value_type& real) + { + real_ += real; + return *this; + } + complex& operator-=(const value_type& real) + { + real_ -= real; + return *this; + } + complex& operator*=(const value_type& real) + { + real_ *= real; + imag_ *= real; + return *this; + } + complex& operator/=(const value_type& real) + { + real_ /= real; + imag_ /= real; + return *this; + } + + template + complex& operator+=(const complex& val) + { + real_ += val.real(); + imag_ += val.imag(); + return *this; + } + template + complex& operator-=(const complex& val) + { + real_ -= val.real(); + imag_ -= val.imag(); 
+ return *this; + } + template + complex& operator*=(const complex& val) + { + auto tmp = real_; + real_ = real_ * val.real() - imag_ * val.imag(); + imag_ = tmp * val.imag() + imag_ * val.real(); + return *this; + } + template + complex& operator/=(const complex& val) + { + auto real = val.real(); + auto imag = val.imag(); + (*this) *= complex{val.real(), -val.imag()}; + (*this) /= (real * real + imag * imag); + return *this; + } + +// It's for MacOS. +// TODO: check whether mac compiler always use complex version even when real +// half +#define COMPLEX_HALF_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES friend complex operator _op( \ + const complex lhf, const complex rhf) \ + { \ + auto a = lhf; \ + a _opeq rhf; \ + return a; \ + } + + COMPLEX_HALF_OPERATOR(+, +=) + COMPLEX_HALF_OPERATOR(-, -=) + COMPLEX_HALF_OPERATOR(*, *=) + COMPLEX_HALF_OPERATOR(/, /=) + +private: + value_type real_; + value_type imag_; +}; + +#ifndef SYCL_LANGUAGE_VERSION +template <> +struct numeric_limits { + static constexpr bool is_specialized{true}; + static constexpr bool is_signed{true}; + static constexpr bool is_integer{false}; + static constexpr bool is_exact{false}; + static constexpr bool is_bounded{true}; + static constexpr bool is_modulo{false}; + static constexpr int digits{ + gko::detail::float_traits::significand_bits + 1}; + // 3/10 is approx. log_10(2) + static constexpr int digits10{digits * 3 / 10}; + + // Note: gko::half can't return gko::half here because it does not have + // a constexpr constructor. + static constexpr float epsilon() + { + return gko::detail::float_traits::eps; + } + + static constexpr float infinity() + { + return numeric_limits::infinity(); + } + + static constexpr float min() { return numeric_limits::min(); } + + static constexpr float max() { return numeric_limits::max(); } + + static constexpr float quiet_NaN() + { + return numeric_limits::quiet_NaN(); + } +}; + +#endif + +// complex using a template on operator= for any kind of complex, so we can +// do full specialization for half +template <> +inline complex& complex::operator=( + const std::complex& a) +{ + complex t(a.real(), a.imag()); + operator=(t); + return *this; +} + + +// For MSVC +template <> +inline complex& complex::operator=( + const std::complex& a) +{ + complex t(a.real(), a.imag()); + operator=(t); + return *this; +} + + +} // namespace std + -#endif // GKO_BASE_HALF_HPP_ +#endif // GKO_PUBLIC_CORE_BASE_HALF_HPP_ diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 8e88828ad29..290c3a91890 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -50,6 +50,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif // __HIPCC__ +#ifdef SYCL_LANGUAGE_VERSION +#include +#endif // Macros for handling different compilers / architectures uniformly #if defined(__CUDACC__) || defined(__HIPCC__) @@ -156,8 +159,11 @@ using uint64 = std::uint64_t; */ using uintptr = std::uintptr_t; - +#ifdef SYCL_LANGUAGE_VERSION +using half = sycl::half; +#else class half; +#endif /** @@ -850,5 +856,5 @@ using comm_index_type = int; } // namespace experimental } // namespace gko -#include "core/base/extended_float.hpp" +#include #endif // GKO_PUBLIC_CORE_BASE_TYPES_HPP_ From 9480b500127cfe1fec777c9f6185a820951bfd3e Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Tue, 7 Feb 2023 15:36:55 +0100 Subject: [PATCH 15/48] limit the next precision in test and benchmark next_precision = float not half --- benchmark/utils/types.hpp | 29 +++++++++++++ core/test/utils.hpp | 30 +++++++++++++ reference/test/base/combination.cpp | 8 ++-- reference/test/base/composition.cpp | 8 ++-- reference/test/base/perturbation.cpp | 8 ++-- reference/test/matrix/coo_kernels.cpp | 22 ++++------ reference/test/matrix/csr_kernels.cpp | 16 ++++--- reference/test/matrix/dense_kernels.cpp | 16 ++++--- reference/test/matrix/diagonal_kernels.cpp | 12 +++--- reference/test/matrix/ell_kernels.cpp | 42 +++++++++---------- reference/test/matrix/fbcsr_kernels.cpp | 8 ++-- reference/test/matrix/hybrid_kernels.cpp | 16 ++++--- reference/test/matrix/identity.cpp | 2 +- reference/test/matrix/sellp_kernels.cpp | 12 +++--- .../test/matrix/sparsity_csr_kernels.cpp | 11 +++-- reference/test/preconditioner/ic.cpp | 10 ++--- reference/test/preconditioner/ilu.cpp | 10 ++--- reference/test/preconditioner/jacobi.cpp | 2 +- .../test/preconditioner/jacobi_kernels.cpp | 8 ++-- reference/test/reorder/scaled_reordered.cpp | 4 +- reference/test/solver/bicg_kernels.cpp | 10 ++--- reference/test/solver/bicgstab_kernels.cpp | 10 ++--- reference/test/solver/cb_gmres_kernels.cpp | 10 ++--- reference/test/solver/cg_kernels.cpp | 10 ++--- reference/test/solver/cgs_kernels.cpp | 10 ++--- reference/test/solver/fcg_kernels.cpp | 10 ++--- reference/test/solver/gmres_kernels.cpp | 10 ++--- reference/test/solver/idr_kernels.cpp | 10 ++--- reference/test/solver/ir_kernels.cpp | 8 ++-- reference/test/solver/lower_trs_kernels.cpp | 8 ++-- reference/test/solver/multigrid_kernels.cpp | 2 +- reference/test/solver/upper_trs_kernels.cpp | 8 ++-- test/matrix/matrix.cpp | 2 +- test/mpi/matrix.cpp | 4 +- test/mpi/solver/solver.cpp | 4 +- test/mpi/vector.cpp | 4 +- test/solver/solver.cpp | 2 +- 37 files changed, 220 insertions(+), 176 deletions(-) diff --git a/benchmark/utils/types.hpp b/benchmark/utils/types.hpp index 6ac57ad23c2..acd0c6cb8a2 100644 --- a/benchmark/utils/types.hpp +++ b/benchmark/utils/types.hpp @@ -67,4 +67,33 @@ using etype = double; using rc_etype = gko::remove_complex; +namespace detail { + + +// singly linked list of all our supported precisions +template +struct next_precision_impl {}; + +template <> +struct next_precision_impl { + using type = double; +}; + +template <> +struct next_precision_impl { + using type = float; +}; + + +template +struct next_precision_impl> { + using type = std::complex::type>; +}; + + +} // namespace detail + +template +using next_precision = typename detail::next_precision_impl::type; + #endif // GKO_BENCHMARK_UTILS_TYPES_HPP_ diff --git a/core/test/utils.hpp b/core/test/utils.hpp index f2c3b33a02f..874daafb137 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -303,4 +303,34 @@ struct TupleTypenameNameGenerator { }; +namespace detail { + + +// singly linked list of all our supported precisions +template +struct next_precision_impl {}; + +template <> +struct next_precision_impl { + using type = double; +}; + +template <> +struct next_precision_impl { + using type = float; +}; + + +template +struct next_precision_impl> { + using type = std::complex::type>; +}; + + +} // namespace detail + +template +using next_precision = typename detail::next_precision_impl::type; + + #endif // GKO_CORE_TEST_UTILS_HPP_ diff --git a/reference/test/base/combination.cpp b/reference/test/base/combination.cpp index 0789b446d23..1c6736c289b 100644 --- 
a/reference/test/base/combination.cpp +++ b/reference/test/base/combination.cpp @@ -147,7 +147,7 @@ TYPED_TEST(Combination, AppliesToMixedVector) cmb = [ 8 7 ] [ 5 4 ] */ - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto cmb = gko::Combination::create( this->coefficients[0], this->operators[0], this->coefficients[1], @@ -189,7 +189,7 @@ TYPED_TEST(Combination, AppliesToMixedComplexVector) cmb = [ 8 7 ] [ 5 4 ] */ - using value_type = gko::to_complex>; + using value_type = gko::to_complex>; using Mtx = gko::matrix::Dense; auto cmb = gko::Combination::create( this->coefficients[0], this->operators[0], this->coefficients[1], @@ -233,7 +233,7 @@ TYPED_TEST(Combination, AppliesLinearCombinationToMixedVector) cmb = [ 8 7 ] [ 5 4 ] */ - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto cmb = gko::Combination::create( this->coefficients[0], this->operators[0], this->coefficients[1], @@ -281,7 +281,7 @@ TYPED_TEST(Combination, AppliesLinearCombinationToMixedComplexVector) cmb = [ 8 7 ] [ 5 4 ] */ - using MixedDense = gko::matrix::Dense>; + using MixedDense = gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using value_type = typename MixedDenseComplex::value_type; auto cmb = gko::Combination::create( diff --git a/reference/test/base/composition.cpp b/reference/test/base/composition.cpp index 0b89606dd9d..019a2eb4cbf 100644 --- a/reference/test/base/composition.cpp +++ b/reference/test/base/composition.cpp @@ -175,7 +175,7 @@ TYPED_TEST(Composition, AppliesSingleToMixedVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using Mtx = gko::matrix::Dense>; + using Mtx = gko::matrix::Dense>; using value_type = typename Mtx::value_type; auto cmp = gko::Composition::create(this->product); auto x = gko::initialize({1.0, 2.0}, this->exec); @@ -215,7 +215,7 @@ TYPED_TEST(Composition, AppliesSingleToMixedComplexVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using value_type = gko::next_precision>; + using value_type = next_precision>; using Mtx = gko::matrix::Dense; auto cmp = gko::Composition::create(this->product); auto x = gko::initialize( @@ -255,7 +255,7 @@ TYPED_TEST(Composition, AppliesSingleLinearCombinationToMixedVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto cmp = gko::Composition::create(this->product); auto alpha = gko::initialize({3.0}, this->exec); @@ -300,7 +300,7 @@ TYPED_TEST(Composition, AppliesSingleLinearCombinationToMixedComplexVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using MixedDense = gko::matrix::Dense>; + using MixedDense = gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using value_type = typename MixedDenseComplex::value_type; auto cmp = gko::Composition::create(this->product); diff --git a/reference/test/base/perturbation.cpp b/reference/test/base/perturbation.cpp index 45483112b7c..b82d1485f93 100644 --- a/reference/test/base/perturbation.cpp +++ b/reference/test/base/perturbation.cpp @@ -134,7 +134,7 @@ TYPED_TEST(Perturbation, AppliesToMixedVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using Mtx = gko::matrix::Dense>; + using Mtx = gko::matrix::Dense>; using value_type = typename Mtx::value_type; auto cmp = gko::Perturbation::create(this->scalar, this->basis, this->projector); @@ -176,7 +176,7 @@ TYPED_TEST(Perturbation, AppliesToMixedComplexVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using value_type = gko::to_complex>; + using 
value_type = gko::to_complex>; using Mtx = gko::matrix::Dense; auto cmp = gko::Perturbation::create(this->scalar, this->basis, this->projector); @@ -218,7 +218,7 @@ TYPED_TEST(Perturbation, AppliesLinearCombinationToMixedVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto cmp = gko::Perturbation::create(this->scalar, this->basis, this->projector); @@ -265,7 +265,7 @@ TYPED_TEST(Perturbation, AppliesLinearCombinationToMixedComplexVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using MixedDense = gko::matrix::Dense>; + using MixedDense = gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using value_type = typename MixedDenseComplex::value_type; auto cmp = gko::Perturbation::create(this->scalar, this->basis, diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp index 8a9061eb09b..8c5d88ea488 100644 --- a/reference/test/matrix/coo_kernels.cpp +++ b/reference/test/matrix/coo_kernels.cpp @@ -64,7 +64,7 @@ class Coo : public ::testing::Test { using Csr = gko::matrix::Csr; using Mtx = gko::matrix::Coo; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = gko::matrix::Dense>; Coo() : exec(gko::ReferenceExecutor::create()), mtx(Mtx::create(exec)) { @@ -111,7 +111,7 @@ TYPED_TEST(Coo, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto tmp = OtherCoo::create(this->exec); @@ -134,7 +134,7 @@ TYPED_TEST(Coo, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto tmp = OtherCoo::create(this->exec); @@ -248,7 +248,7 @@ TYPED_TEST(Coo, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto empty = OtherCoo::create(this->exec); @@ -265,7 +265,7 @@ TYPED_TEST(Coo, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto empty = OtherCoo::create(this->exec); @@ -735,8 +735,7 @@ TYPED_TEST(Coo, AppliesToComplex) TYPED_TEST(Coo, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -791,8 +790,7 @@ TYPED_TEST(Coo, AdvancedAppliesToComplex) TYPED_TEST(Coo, AdvancedAppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; @@ -849,8 +847,7 @@ TYPED_TEST(Coo, ApplyAddsToComplex) TYPED_TEST(Coo, 
ApplyAddsToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedVec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -906,8 +903,7 @@ TYPED_TEST(Coo, ApplyAddsScaledToComplex) TYPED_TEST(Coo, ApplyAddsScaledToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index 596f103cb21..f9f58b4bf28 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -75,7 +75,7 @@ class Csr : public ::testing::Test { using Ell = gko::matrix::Ell; using Hybrid = gko::matrix::Hybrid; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = gko::matrix::Dense>; Csr() : exec(gko::ReferenceExecutor::create()), @@ -793,7 +793,7 @@ TYPED_TEST(Csr, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto tmp = OtherCsr::create(this->exec); @@ -820,7 +820,7 @@ TYPED_TEST(Csr, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto tmp = OtherCsr::create(this->exec); @@ -999,7 +999,7 @@ TYPED_TEST(Csr, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto empty = OtherCsr::create(this->exec); @@ -1018,7 +1018,7 @@ TYPED_TEST(Csr, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto empty = OtherCsr::create(this->exec); @@ -1611,8 +1611,7 @@ TYPED_TEST(Csr, AppliesToComplex) TYPED_TEST(Csr, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -1667,8 +1666,7 @@ TYPED_TEST(Csr, AdvancedAppliesToComplex) TYPED_TEST(Csr, AdvancedAppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index a9105279626..bb90097afa1 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -66,7 +66,7 @@ class Dense : public ::testing::Test { protected: using value_type = T; using Mtx = gko::matrix::Dense; - using MixedMtx 
= gko::matrix::Dense>; + using MixedMtx = gko::matrix::Dense>; using ComplexMtx = gko::to_complex; using MixedComplexMtx = gko::to_complex; using RealMtx = gko::remove_complex; @@ -744,7 +744,7 @@ TYPED_TEST(Dense, ConvertsToPrecision) { using Dense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherDense = typename gko::matrix::Dense; auto tmp = OtherDense::create(this->exec); auto res = Dense::create(this->exec); @@ -766,7 +766,7 @@ TYPED_TEST(Dense, MovesToPrecision) { using Dense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherDense = typename gko::matrix::Dense; auto tmp = OtherDense::create(this->exec); auto res = Dense::create(this->exec); @@ -1886,7 +1886,7 @@ TYPED_TEST(Dense, ConvertsEmptyToPrecision) { using Dense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherDense = typename gko::matrix::Dense; auto empty = OtherDense::create(this->exec); auto res = Dense::create(this->exec); @@ -1901,7 +1901,7 @@ TYPED_TEST(Dense, MovesEmptyToPrecision) { using Dense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherDense = typename gko::matrix::Dense; auto empty = OtherDense::create(this->exec); auto res = Dense::create(this->exec); @@ -3592,8 +3592,7 @@ TYPED_TEST(Dense, AppliesToComplex) TYPED_TEST(Dense, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -3646,8 +3645,7 @@ TYPED_TEST(Dense, AdvancedAppliesToComplex) TYPED_TEST(Dense, AdvancedAppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; diff --git a/reference/test/matrix/diagonal_kernels.cpp b/reference/test/matrix/diagonal_kernels.cpp index 4a043555a08..6ee00be3e47 100644 --- a/reference/test/matrix/diagonal_kernels.cpp +++ b/reference/test/matrix/diagonal_kernels.cpp @@ -62,7 +62,7 @@ class Diagonal : public ::testing::Test { using Csr = gko::matrix::Csr; using Diag = gko::matrix::Diagonal; using Dense = gko::matrix::Dense; - using MixedDense = gko::matrix::Dense>; + using MixedDense = gko::matrix::Dense>; Diagonal() : exec(gko::ReferenceExecutor::create()), @@ -117,7 +117,7 @@ TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypes, TypenameNameGenerator); TYPED_TEST(Diagonal, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Diagonal = typename TestFixture::Diag; using OtherDiagonal = gko::matrix::Diagonal; auto tmp = OtherDiagonal::create(this->exec); @@ -139,7 +139,7 @@ TYPED_TEST(Diagonal, ConvertsToPrecision) TYPED_TEST(Diagonal, MovesToPrecision) { using ValueType = typename TestFixture::value_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Diagonal = typename TestFixture::Diag; using OtherDiagonal = gko::matrix::Diagonal; auto tmp = 
OtherDiagonal::create(this->exec); @@ -606,8 +606,7 @@ TYPED_TEST(Diagonal, AppliesToComplex) TYPED_TEST(Diagonal, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -666,8 +665,7 @@ TYPED_TEST(Diagonal, AppliesLinearCombinationToComplex) TYPED_TEST(Diagonal, AppliesLinearCombinationToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; using Scalar = gko::matrix::Dense; diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp index b76487b9fb2..0c6103916a3 100644 --- a/reference/test/matrix/ell_kernels.cpp +++ b/reference/test/matrix/ell_kernels.cpp @@ -63,7 +63,7 @@ class Ell : public ::testing::Test { using Mtx = gko::matrix::Ell; using Csr = gko::matrix::Csr; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = gko::matrix::Dense>; Ell() : exec(gko::ReferenceExecutor::create()), @@ -124,7 +124,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec = typename gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); @@ -139,7 +139,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); @@ -155,9 +155,9 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; - using Vec2 = gko::matrix::Dense>; + using Vec2 = gko::matrix::Dense>; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec1::create(this->exec, gko::dim<2>{2, 1}); @@ -193,7 +193,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec = gko::matrix::Dense; // clang-format off auto x = gko::initialize( @@ -217,7 +217,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; // clang-format off @@ -242,7 +242,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; // clang-format off @@ -281,7 +281,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector1) { // Both vectors have the same value type which differs from the matrix using T = typename 
TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -298,7 +298,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -316,7 +316,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -360,7 +360,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -388,7 +388,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -417,7 +417,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -476,7 +476,7 @@ TYPED_TEST(Ell, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto tmp = OtherEll::create(this->exec); @@ -499,7 +499,7 @@ TYPED_TEST(Ell, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto tmp = OtherEll::create(this->exec); @@ -769,7 +769,7 @@ TYPED_TEST(Ell, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto empty = Ell::create(this->exec); @@ -786,7 +786,7 @@ TYPED_TEST(Ell, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto empty = Ell::create(this->exec); @@ 
-930,8 +930,7 @@ TYPED_TEST(Ell, AppliesToComplex) TYPED_TEST(Ell, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -987,8 +986,7 @@ TYPED_TEST(Ell, AdvancedAppliesToComplex) TYPED_TEST(Ell, AdvancedAppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp index 95a0d2db6ff..2dea452c655 100644 --- a/reference/test/matrix/fbcsr_kernels.cpp +++ b/reference/test/matrix/fbcsr_kernels.cpp @@ -303,7 +303,7 @@ TYPED_TEST(Fbcsr, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto tmp = OtherFbcsr::create(this->exec); @@ -326,7 +326,7 @@ TYPED_TEST(Fbcsr, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto tmp = OtherFbcsr::create(this->exec); @@ -425,7 +425,7 @@ TYPED_TEST(Fbcsr, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto empty = OtherFbcsr::create(this->exec); @@ -444,7 +444,7 @@ TYPED_TEST(Fbcsr, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto empty = OtherFbcsr::create(this->exec); diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp index 85086b334c5..9573670ba81 100644 --- a/reference/test/matrix/hybrid_kernels.cpp +++ b/reference/test/matrix/hybrid_kernels.cpp @@ -64,7 +64,7 @@ class Hybrid : public ::testing::Test { using Mtx = gko::matrix::Hybrid; using Vec = gko::matrix::Dense; using Csr = gko::matrix::Csr; - using MixedVec = gko::matrix::Dense>; + using MixedVec = gko::matrix::Dense>; Hybrid() : exec(gko::ReferenceExecutor::create()), @@ -265,7 +265,7 @@ TYPED_TEST(Hybrid, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto tmp = OtherHybrid::create(this->exec); @@ -288,7 +288,7 @@ TYPED_TEST(Hybrid, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Hybrid = typename TestFixture::Mtx; 
using OtherHybrid = gko::matrix::Hybrid; auto tmp = OtherHybrid::create(this->exec); @@ -400,7 +400,7 @@ TYPED_TEST(Hybrid, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto other = Hybrid::create(this->exec); @@ -417,7 +417,7 @@ TYPED_TEST(Hybrid, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto other = Hybrid::create(this->exec); @@ -731,8 +731,7 @@ TYPED_TEST(Hybrid, AppliesToComplex) TYPED_TEST(Hybrid, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -788,8 +787,7 @@ TYPED_TEST(Hybrid, AdvancedAppliesToComplex) TYPED_TEST(Hybrid, AdvancedAppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; diff --git a/reference/test/matrix/identity.cpp b/reference/test/matrix/identity.cpp index 6836c0707ed..a406da16b18 100644 --- a/reference/test/matrix/identity.cpp +++ b/reference/test/matrix/identity.cpp @@ -51,7 +51,7 @@ class Identity : public ::testing::Test { using value_type = T; using Id = gko::matrix::Identity; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = gko::matrix::Dense>; using ComplexVec = gko::to_complex; using MixedComplexVec = gko::to_complex; diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp index 1fb65e940c4..7a85e6c46a6 100644 --- a/reference/test/matrix/sellp_kernels.cpp +++ b/reference/test/matrix/sellp_kernels.cpp @@ -98,7 +98,7 @@ TYPED_TEST(Sellp, AppliesToDenseVector) TYPED_TEST(Sellp, AppliesToMixedDenseVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); @@ -147,7 +147,7 @@ TYPED_TEST(Sellp, AppliesLinearCombinationToDenseVector) TYPED_TEST(Sellp, AppliesLinearCombinationToMixedDenseVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -220,7 +220,7 @@ TYPED_TEST(Sellp, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto tmp = OtherSellp::create(this->exec); @@ -243,7 +243,7 @@ TYPED_TEST(Sellp, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Sellp = typename 
TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto tmp = OtherSellp::create(this->exec); @@ -341,7 +341,7 @@ TYPED_TEST(Sellp, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto empty = OtherSellp::create(this->exec); @@ -360,7 +360,7 @@ TYPED_TEST(Sellp, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto empty = OtherSellp::create(this->exec); diff --git a/reference/test/matrix/sparsity_csr_kernels.cpp b/reference/test/matrix/sparsity_csr_kernels.cpp index dde558d27fd..fa0fc76c947 100644 --- a/reference/test/matrix/sparsity_csr_kernels.cpp +++ b/reference/test/matrix/sparsity_csr_kernels.cpp @@ -177,7 +177,7 @@ TYPED_TEST(SparsityCsr, AppliesToDenseVector) TYPED_TEST(SparsityCsr, AppliesToMixedDenseVector) { - using T = gko::next_precision; + using T = next_precision; using Vec = gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); @@ -224,7 +224,7 @@ TYPED_TEST(SparsityCsr, AppliesLinearCombinationToDenseVector) TYPED_TEST(SparsityCsr, AppliesLinearCombinationToMixedDenseVector) { - using T = gko::next_precision; + using T = next_precision; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -275,8 +275,7 @@ TYPED_TEST(SparsityCsr, AppliesToComplex) TYPED_TEST(SparsityCsr, AppliesToMixedComplex) { - using T = - gko::next_precision>; + using T = next_precision>; using Vec = gko::matrix::Dense; auto x = gko::initialize({T{2.0, 4.0}, T{1.0, 2.0}, T{4.0, 8.0}}, this->exec); @@ -310,8 +309,8 @@ TYPED_TEST(SparsityCsr, AppliesLinearCombinationToComplex) TYPED_TEST(SparsityCsr, AppliesLinearCombinationToMixedComplex) { - using Vec = gko::matrix::Dense< - gko::next_precision>; + using Vec = + gko::matrix::Dense>; using ComplexVec = gko::to_complex; using T = typename ComplexVec::value_type; auto alpha = gko::initialize({-1.0}, this->exec); diff --git a/reference/test/preconditioner/ic.cpp b/reference/test/preconditioner/ic.cpp index b3f7348adde..aae3f577492 100644 --- a/reference/test/preconditioner/ic.cpp +++ b/reference/test/preconditioner/ic.cpp @@ -278,7 +278,7 @@ TYPED_TEST(Ic, SolvesSingleRhsMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; using T = typename TestFixture::value_type; - using Vec = gko::matrix::Dense>; + using Vec = gko::matrix::Dense>; const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); auto x = Vec::create(this->exec, gko::dim<2>{3, 1}); auto preconditioner = @@ -312,7 +312,7 @@ TYPED_TEST(Ic, SolvesSingleRhsComplexMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; using Vec = gko::matrix::Dense< - gko::next_precision>>; + next_precision>>; using T = typename Vec::value_type; const auto b = gko::initialize( {T{1.0, 2.0}, T{3.0, 6.0}, T{6.0, 12.0}}, this->exec); @@ -348,7 +348,7 @@ TYPED_TEST(Ic, AdvancedSolvesSingleRhsMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; using T = typename TestFixture::value_type; - using Vec = gko::matrix::Dense>; + using Vec = 
gko::matrix::Dense>; const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); const auto alpha = gko::initialize({2.0}, this->exec); const auto beta = gko::initialize({-1.0}, this->exec); @@ -387,8 +387,8 @@ TYPED_TEST(Ic, AdvancedSolvesSingleRhsComplex) TYPED_TEST(Ic, AdvancedSolvesSingleRhsComplexMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; - using MixedDense = gko::matrix::Dense< - gko::next_precision>; + using MixedDense = + gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using T = typename MixedDenseComplex::value_type; const auto b = gko::initialize( diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp index ce3ea72725f..b9ab9683134 100644 --- a/reference/test/preconditioner/ilu.cpp +++ b/reference/test/preconditioner/ilu.cpp @@ -356,8 +356,8 @@ TYPED_TEST(Ilu, SolvesSingleRhsWithMtx) TYPED_TEST(Ilu, SolvesSingleRhsWithMixedMtx) { - using Mtx = gko::matrix::Dense< - gko::next_precision>; + using Mtx = + gko::matrix::Dense>; const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); auto x = Mtx::create(this->exec, gko::dim<2>{3, 1}); x->copy_from(b); @@ -391,7 +391,7 @@ TYPED_TEST(Ilu, SolvesSingleRhsWithComplexMtx) TYPED_TEST(Ilu, SolvesSingleRhsWithMixedComplexMtx) { using Mtx = gko::matrix::Dense< - gko::to_complex>>; + gko::to_complex>>; using T = typename Mtx::value_type; const auto b = gko::initialize( {T{1.0, 2.0}, T{3.0, 6.0}, T{6.0, 12.0}}, this->exec); @@ -444,7 +444,7 @@ TYPED_TEST(Ilu, SolvesAdvancedSingleRhs) TYPED_TEST(Ilu, SolvesAdvancedSingleRhsMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; const value_type alpha{2.0}; const auto alpha_linop = gko::initialize({alpha}, this->exec); @@ -494,7 +494,7 @@ TYPED_TEST(Ilu, SolvesAdvancedSingleRhsComplex) TYPED_TEST(Ilu, SolvesAdvancedSingleRhsMixedComplex) { - using value_type = gko::next_precision; + using value_type = next_precision; using complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::to_complex; diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp index 7fb7d85558c..571cc73133a 100644 --- a/reference/test/preconditioner/jacobi.cpp +++ b/reference/test/preconditioner/jacobi.cpp @@ -510,7 +510,7 @@ TYPED_TEST(Jacobi, ScalarJacobiGeneratesOnDifferentPrecision) { using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - using next_type = gko::next_precision; + using next_type = next_precision; using Bj = typename TestFixture::Bj; auto csr = gko::share(gko::matrix::Csr::create(this->exec)); diff --git a/reference/test/preconditioner/jacobi_kernels.cpp b/reference/test/preconditioner/jacobi_kernels.cpp index 679a56030da..9a2d3411cbb 100644 --- a/reference/test/preconditioner/jacobi_kernels.cpp +++ b/reference/test/preconditioner/jacobi_kernels.cpp @@ -675,7 +675,7 @@ TYPED_TEST(Jacobi, ScalarJacobiAppliesToVector) TYPED_TEST(Jacobi, AppliesToMixedVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec); auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec); @@ -716,7 +716,7 @@ TYPED_TEST(Jacobi, AppliesToComplexVector) TYPED_TEST(Jacobi, AppliesToMixedComplexVector) { using value_type = - gko::to_complex>; + gko::to_complex>; using Vec = gko::matrix::Dense; auto x = gko::initialize( 
{value_type{1.0, 2.0}, value_type{-1.0, -2.0}, value_type{2.0, 4.0}, @@ -921,7 +921,7 @@ TYPED_TEST(Jacobi, ScalarJacobiAppliesLinearCombinationToVector) TYPED_TEST(Jacobi, AppliesLinearCombinationToMixedVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec); auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec); @@ -964,7 +964,7 @@ TYPED_TEST(Jacobi, AppliesLinearCombinationToComplexVector) TYPED_TEST(Jacobi, AppliesLinearCombinationToMixedComplexVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::to_complex; using T = gko::to_complex; diff --git a/reference/test/reorder/scaled_reordered.cpp b/reference/test/reorder/scaled_reordered.cpp index 8789ded37ca..baeb1cf005d 100644 --- a/reference/test/reorder/scaled_reordered.cpp +++ b/reference/test/reorder/scaled_reordered.cpp @@ -477,7 +477,7 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithScalingAndRcmReorderingMixed) { using SR = typename TestFixture::SR; using T = typename TestFixture::value_type; - using Vec = gko::matrix::Dense>; + using Vec = gko::matrix::Dense>; auto scaled_reordered_fact = SR::build() .with_row_scaling(this->diag2) .with_col_scaling(this->diag3) @@ -521,7 +521,7 @@ TYPED_TEST(ScaledReordered, { using SR = typename TestFixture::SR; using T = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; auto scaled_reordered_fact = SR::build() .with_row_scaling(this->diag2) diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp index e317677b2de..eafba6ca123 100644 --- a/reference/test/solver/bicg_kernels.cpp +++ b/reference/test/solver/bicg_kernels.cpp @@ -307,7 +307,7 @@ TYPED_TEST(Bicg, SolvesStencilSystem) TYPED_TEST(Bicg, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->bicg_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -344,7 +344,7 @@ TYPED_TEST(Bicg, SolvesStencilSystemComplex) TYPED_TEST(Bicg, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->bicg_factory->generate(this->mtx); auto b = gko::initialize( @@ -399,7 +399,7 @@ TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->bicg_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -440,8 +440,8 @@ TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->bicg_factory->generate(this->mtx); diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp index ec44b6b6f17..56e11dd84bc 100644 --- a/reference/test/solver/bicgstab_kernels.cpp +++ b/reference/test/solver/bicgstab_kernels.cpp @@ -422,7 +422,7 @@ TYPED_TEST(Bicgstab, 
SolvesDenseSystem) TYPED_TEST(Bicgstab, SolvesDenseSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->bicgstab_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -459,7 +459,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemComplex) TYPED_TEST(Bicgstab, SolvesDenseSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->bicgstab_factory->generate(this->mtx); auto b = gko::initialize( @@ -534,7 +534,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApply) TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->bicgstab_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -575,8 +575,8 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyComplex) TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->bicgstab_factory->generate(this->mtx); diff --git a/reference/test/solver/cb_gmres_kernels.cpp b/reference/test/solver/cb_gmres_kernels.cpp index 1127d7caff7..0b3580163b1 100644 --- a/reference/test/solver/cb_gmres_kernels.cpp +++ b/reference/test/solver/cb_gmres_kernels.cpp @@ -203,7 +203,7 @@ TYPED_TEST(CbGmres, SolvesStencilSystem) TYPED_TEST(CbGmres, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cb_gmres_factory->generate(this->mtx); auto b = gko::initialize({13.0, 7.0, 1.0}, this->exec); @@ -242,7 +242,7 @@ TYPED_TEST(CbGmres, SolvesStencilSystemComplex) TYPED_TEST(CbGmres, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->cb_gmres_factory->generate(this->mtx); auto b = @@ -327,7 +327,7 @@ TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cb_gmres_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -370,8 +370,8 @@ TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->cb_gmres_factory->generate(this->mtx); diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp index 76b8cf55946..ffe594625ef 100644 --- a/reference/test/solver/cg_kernels.cpp +++ b/reference/test/solver/cg_kernels.cpp @@ -268,7 +268,7 @@ TYPED_TEST(Cg, SolvesStencilSystem) TYPED_TEST(Cg, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cg_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -305,7 +305,7 @@ TYPED_TEST(Cg, SolvesStencilSystemComplex) 
TYPED_TEST(Cg, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->cg_factory->generate(this->mtx); auto b = gko::initialize( @@ -360,7 +360,7 @@ TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cg_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -401,8 +401,8 @@ TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->cg_factory->generate(this->mtx); diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp index 9c3ce2071a7..6f9c821025d 100644 --- a/reference/test/solver/cgs_kernels.cpp +++ b/reference/test/solver/cgs_kernels.cpp @@ -332,7 +332,7 @@ TYPED_TEST(Cgs, SolvesDenseSystem) TYPED_TEST(Cgs, SolvesDenseSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cgs_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -369,7 +369,7 @@ TYPED_TEST(Cgs, SolvesDenseSystemComplex) TYPED_TEST(Cgs, SolvesDenseSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->cgs_factory->generate(this->mtx); auto b = gko::initialize( @@ -425,7 +425,7 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApply) TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cgs_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -466,8 +466,8 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyComplex) TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->cgs_factory->generate(this->mtx); diff --git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp index e8163752689..8a3a796c60a 100644 --- a/reference/test/solver/fcg_kernels.cpp +++ b/reference/test/solver/fcg_kernels.cpp @@ -281,7 +281,7 @@ TYPED_TEST(Fcg, SolvesStencilSystem) TYPED_TEST(Fcg, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->fcg_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -318,7 +318,7 @@ TYPED_TEST(Fcg, SolvesStencilSystemComplex) TYPED_TEST(Fcg, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->fcg_factory->generate(this->mtx); auto b = gko::initialize( @@ -373,7 +373,7 @@ TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = 
gko::matrix::Dense; auto solver = this->fcg_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -414,8 +414,8 @@ TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->fcg_factory->generate(this->mtx); diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index 585fec833bc..47cc8f0476a 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -422,7 +422,7 @@ TYPED_TEST(Gmres, SolvesStencilSystem) TYPED_TEST(Gmres, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->gmres_factory->generate(this->mtx); auto b = gko::initialize({13.0, 7.0, 1.0}, this->exec); @@ -460,7 +460,7 @@ TYPED_TEST(Gmres, SolvesStencilSystemComplex) TYPED_TEST(Gmres, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->gmres_factory->generate(this->mtx); auto b = @@ -516,7 +516,7 @@ TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->gmres_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -558,8 +558,8 @@ TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->gmres_factory->generate(this->mtx); diff --git a/reference/test/solver/idr_kernels.cpp b/reference/test/solver/idr_kernels.cpp index 3e74e0c319b..a1154ccb598 100644 --- a/reference/test/solver/idr_kernels.cpp +++ b/reference/test/solver/idr_kernels.cpp @@ -114,7 +114,7 @@ TYPED_TEST(Idr, SolvesDenseSystem) TYPED_TEST(Idr, SolvesDenseSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->idr_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -151,7 +151,7 @@ TYPED_TEST(Idr, SolvesDenseSystemComplex) TYPED_TEST(Idr, SolvesDenseSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->idr_factory->generate(this->mtx); auto b = gko::initialize( @@ -269,7 +269,7 @@ TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApply) TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->idr_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -310,8 +310,8 @@ TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyComplex) TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename 
Mtx::value_type; auto solver = this->idr_factory->generate(this->mtx); diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp index 8b4255b72ef..0eb45e7026e 100644 --- a/reference/test/solver/ir_kernels.cpp +++ b/reference/test/solver/ir_kernels.cpp @@ -116,7 +116,7 @@ TYPED_TEST(Ir, SolvesTriangularSystem) TYPED_TEST(Ir, SolvesTriangularSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->ir_factory->generate(this->mtx); auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); @@ -153,7 +153,7 @@ TYPED_TEST(Ir, SolvesTriangularSystemComplex) TYPED_TEST(Ir, SolvesTriangularSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->ir_factory->generate(this->mtx); auto b = gko::initialize( @@ -279,8 +279,8 @@ TYPED_TEST(Ir, SolvesTriangularSystemUsingAdvancedApplyComplex) TYPED_TEST(Ir, SolvesTriangularSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->ir_factory->generate(this->mtx); diff --git a/reference/test/solver/lower_trs_kernels.cpp b/reference/test/solver/lower_trs_kernels.cpp index ed3fff964e6..0a5d6d47e54 100644 --- a/reference/test/solver/lower_trs_kernels.cpp +++ b/reference/test/solver/lower_trs_kernels.cpp @@ -133,7 +133,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystem) TYPED_TEST(LowerTrs, SolvesTriangularSystemMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; std::shared_ptr b = gko::initialize({1.0, 2.0, 1.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); @@ -171,7 +171,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemComplex) TYPED_TEST(LowerTrs, SolvesTriangularSystemMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; std::shared_ptr b = gko::initialize( @@ -242,7 +242,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApply) TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -284,7 +284,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyComplex) TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto alpha = gko::initialize({2.0}, this->exec); diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp index 3efb9d41c5e..3b32d2a1235 100644 --- a/reference/test/solver/multigrid_kernels.cpp +++ b/reference/test/solver/multigrid_kernels.cpp @@ -265,7 +265,7 @@ class Multigrid : public ::testing::Test { using Smoother = gko::solver::Ir; using InnerSolver = gko::preconditioner::Jacobi; using 
CoarsestSolver = gko::solver::Cg; - using CoarsestNextSolver = gko::solver::Cg>; + using CoarsestNextSolver = gko::solver::Cg>; using DummyRPFactory = DummyMultigridLevelWithFactory; using DummyFactory = DummyLinOpWithFactory; Multigrid() diff --git a/reference/test/solver/upper_trs_kernels.cpp b/reference/test/solver/upper_trs_kernels.cpp index 148c68bdcb3..915d7b8dd5e 100644 --- a/reference/test/solver/upper_trs_kernels.cpp +++ b/reference/test/solver/upper_trs_kernels.cpp @@ -133,7 +133,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystem) TYPED_TEST(UpperTrs, SolvesTriangularSystemMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; std::shared_ptr b = gko::initialize({4.0, 2.0, 3.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); @@ -171,7 +171,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemComplex) TYPED_TEST(UpperTrs, SolvesTriangularSystemMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; std::shared_ptr b = gko::initialize( @@ -243,7 +243,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApply) TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -285,7 +285,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyComplex) TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto alpha = gko::initialize({2.0}, this->exec); diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index 39c27043324..b75626daefc 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -588,7 +588,7 @@ class Matrix : public CommonTestFixture { using Mtx = typename T::matrix_type; using index_type = typename Mtx::index_type; using value_type = typename Mtx::value_type; - using mixed_value_type = gko::next_precision; + using mixed_value_type = next_precision; using Vec = gko::matrix::Dense; using MixedVec = gko::matrix::Dense; diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 5d12ae9afa7..04b142a4132 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -438,7 +438,7 @@ TYPED_TEST(Matrix, CanConvertToNextPrecision) using csr = typename TestFixture::local_matrix_type; using local_index_type = typename TestFixture::local_index_type; using global_index_type = typename TestFixture::global_index_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherDist = typename gko::experimental::distributed::Matrix< OtherT, local_index_type, global_index_type>; auto tmp = OtherDist::create(this->ref, this->comm); @@ -464,7 +464,7 @@ TYPED_TEST(Matrix, CanMoveToNextPrecision) using csr = typename TestFixture::local_matrix_type; using local_index_type = typename TestFixture::local_index_type; using global_index_type = typename TestFixture::global_index_type; - using OtherT = typename gko::next_precision; 
+ using OtherT = next_precision; using OtherDist = typename gko::experimental::distributed::Matrix< OtherT, local_index_type, global_index_type>; auto tmp = OtherDist::create(this->ref, this->comm); diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp index 1494ab117f2..b04cbf53b87 100644 --- a/test/mpi/solver/solver.cpp +++ b/test/mpi/solver/solver.cpp @@ -74,7 +74,7 @@ template struct SimpleSolverTest { using solver_type = SolverType; using value_type = typename solver_type::value_type; - using mixed_value_type = gko::next_precision; + using mixed_value_type = next_precision; using local_index_type = gko::int32; using global_index_type = gko::int64; using dist_matrix_type = @@ -213,7 +213,7 @@ class Solver : public CommonMpiTestFixture { using local_index_type = typename T::local_index_type; using global_index_type = typename T::global_index_type; using value_type = typename T::value_type; - using mixed_value_type = gko::next_precision; + using mixed_value_type = next_precision; using Vec = typename T::dist_vector_type; using LocalVec = typename T::non_dist_vector_type; using MixedVec = typename T::mixed_dist_vector_type; diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index a7ad735458c..fe3b81406c5 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -847,7 +847,7 @@ TYPED_TEST(VectorLocalOps, AdvancedApplyNotSupported) TYPED_TEST(VectorLocalOps, ConvertsToPrecision) { using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherVector = typename gko::experimental::distributed::Vector; auto local_tmp = OtherVector::local_vector_type::create(this->exec); auto tmp = OtherVector::create(this->exec, this->comm); @@ -863,7 +863,7 @@ TYPED_TEST(VectorLocalOps, ConvertsToPrecision) TYPED_TEST(VectorLocalOps, MovesToPrecision) { using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherVector = typename gko::experimental::distributed::Vector; auto local_tmp = OtherVector::local_vector_type::create(this->exec); auto tmp = OtherVector::create(this->exec, this->comm); diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp index a6adb0394d3..2e09c64ba18 100644 --- a/test/solver/solver.cpp +++ b/test/solver/solver.cpp @@ -560,7 +560,7 @@ class Solver : public CommonTestFixture { using Precond = typename T::precond_type; using Mtx = typename T::matrix_type; using value_type = typename Mtx::value_type; - using mixed_value_type = gko::next_precision; + using mixed_value_type = next_precision; using Vec = gko::matrix::Dense; using MixedVec = gko::matrix::Dense; From 9625e11b851987f38abfa41ae82a2293f8760133 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Tue, 7 Feb 2023 17:03:33 +0100 Subject: [PATCH 16/48] allow disable half operation --- CMakeLists.txt | 1 + cmake/get_info.cmake | 2 + core/base/mixed_precision_types.hpp | 178 ++++++++++----------- core/distributed/matrix.cpp | 3 +- core/distributed/vector.cpp | 7 +- core/matrix/coo.cpp | 2 + core/matrix/csr.cpp | 3 +- core/matrix/dense.cpp | 2 + core/matrix/diagonal.cpp | 3 + core/matrix/ell.cpp | 2 + core/matrix/fbcsr.cpp | 5 +- core/matrix/hybrid.cpp | 2 + core/matrix/row_gatherer.cpp | 16 +- core/matrix/sellp.cpp | 3 + core/multigrid/pgm.cpp | 1 - core/solver/multigrid.cpp | 63 +++++++- include/ginkgo/config.hpp.in | 5 + include/ginkgo/core/base/math.hpp | 11 ++ include/ginkgo/core/base/mpi.hpp | 7 +- include/ginkgo/core/base/types.hpp | 129 +++++++-------- include/ginkgo/core/distributed/matrix.hpp | 8 +- include/ginkgo/core/distributed/vector.hpp | 7 +- include/ginkgo/core/matrix/coo.hpp | 19 ++- include/ginkgo/core/matrix/csr.hpp | 10 +- include/ginkgo/core/matrix/dense.hpp | 27 ++-- include/ginkgo/core/matrix/diagonal.hpp | 14 +- include/ginkgo/core/matrix/ell.hpp | 18 ++- include/ginkgo/core/matrix/fbcsr.hpp | 18 ++- include/ginkgo/core/matrix/hybrid.hpp | 19 ++- include/ginkgo/core/matrix/sellp.hpp | 18 ++- 30 files changed, 377 insertions(+), 226 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index efb3fcc24ff..350a5f296ff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,7 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF) option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF) option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF) option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF) +option(GINKGO_ENABLE_HALF "Enable the half operation" OFF) option(GINKGO_SKIP_DEPENDENCY_UPDATE "Do not update dependencies each time the project is rebuilt" ON) option(GINKGO_EXPORT_BUILD_DIR diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 2dd068abb50..97f6a2dd602 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -205,6 +205,8 @@ if(TARGET hwloc) ginkgo_print_variable(${detailed_log} "HWLOC_LIBRARIES") ginkgo_print_variable(${detailed_log} "HWLOC_INCLUDE_DIRS") endif() +ginkgo_print_variable(${minimal_log} "GINKGO_ENABLE_HALF") +ginkgo_print_variable(${detailed_log} "GINKGO_ENABLE_HALF") _minimal( " diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index 5aa13c2cb66..a8ba4a54e30 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -42,97 +42,97 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) 
\ - template _macro(float, half, half, __VA_ARGS__); \ - template _macro(float, half, float, __VA_ARGS__); \ - template _macro(float, half, double, __VA_ARGS__); \ - template _macro(float, float, half, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(float, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, float, half, __VA_ARGS__)); \ template _macro(float, float, float, __VA_ARGS__); \ template _macro(float, float, double, __VA_ARGS__); \ - template _macro(float, double, half, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(float, double, half, __VA_ARGS__)); \ template _macro(float, double, float, __VA_ARGS__); \ template _macro(float, double, double, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \ - template _macro(double, half, half, __VA_ARGS__); \ - template _macro(double, half, float, __VA_ARGS__); \ - template _macro(double, half, double, __VA_ARGS__); \ - template _macro(double, float, half, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(double, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, float, half, __VA_ARGS__)); \ template _macro(double, float, float, __VA_ARGS__); \ template _macro(double, float, double, __VA_ARGS__); \ - template _macro(double, double, half, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(double, double, half, __VA_ARGS__)); \ template _macro(double, double, float, __VA_ARGS__); \ template _macro(double, double, double, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) 
\ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) \ - template _macro(half, half, half, __VA_ARGS__); \ - template _macro(half, half, float, __VA_ARGS__); \ - template _macro(half, half, double, __VA_ARGS__); \ - template _macro(half, float, half, __VA_ARGS__); \ - template _macro(half, float, float, __VA_ARGS__); \ - template _macro(half, float, double, __VA_ARGS__); \ - template _macro(half, double, half, __VA_ARGS__); \ - template _macro(half, double, float, __VA_ARGS__); \ - template _macro(half, double, double, __VA_ARGS__) + GKO_ADAPT_HF(_macro(half, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, float, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, float, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, double, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, double, double, __VA_ARGS__)) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) 
\ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)) + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) #else @@ -152,11 +152,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std::complex, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) \ - template _macro(half, half, half, __VA_ARGS__) + GKO_ADAPT_HF(_macro(half, half, half, __VA_ARGS__)) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)) + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) #endif @@ -177,38 +177,36 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef GINKGO_MIXED_PRECISION -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ - template _macro(half, half, __VA_ARGS__); \ - template _macro(half, float, __VA_ARGS__); \ - template _macro(half, double, __VA_ARGS__); \ - template _macro(float, half, __VA_ARGS__); \ - template _macro(float, float, __VA_ARGS__); \ - template _macro(float, double, __VA_ARGS__); \ - template _macro(double, half, __VA_ARGS__); \ - template _macro(double, float, __VA_ARGS__); \ - template _macro(double, double, __VA_ARGS__); \ - GKO_ADAPT_CPHF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - GKO_ADAPT_CPHF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) 
\ + GKO_ADAPT_HF(_macro(half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, half, __VA_ARGS__)); \ + template _macro(float, float, __VA_ARGS__); \ + template _macro(float, double, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(double, half, __VA_ARGS__)); \ + template _macro(double, float, __VA_ARGS__); \ + template _macro(double, double, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) #else -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ - template _macro(half, half, __VA_ARGS__); \ - template _macro(float, float, __VA_ARGS__); \ - template _macro(double, double, __VA_ARGS__); \ - GKO_ADAPT_CPHF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ + GKO_ADAPT_HF(_macro(half, half, __VA_ARGS__)); \ + template _macro(float, float, __VA_ARGS__); \ + template _macro(double, double, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) #endif diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 8c16bbaa2cd..262418b7e85 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -139,6 +139,7 @@ void Matrix::move_to( } +#if GKO_ENABLE_HALF template void Matrix::convert_to( Matrix>, local_index_type, @@ -176,7 +177,7 @@ void Matrix::move_to( result->set_size(this->get_size()); this->set_size({}); } - +#endif template void Matrix::read_distributed( diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index f8f4376e217..d584b0f8dd4 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -290,6 +290,7 @@ void Vector::move_to(Vector>* result) } +#if GKO_ENABLE_HALF template void Vector::convert_to( Vector>>* result) const @@ -307,7 +308,7 @@ void Vector::move_to( { this->convert_to(result); } - +#endif template std::unique_ptr::absolute_type> @@ -599,8 +600,8 @@ ValueType& Vector::at_local(size_type row, size_type col) noexcept } template -ValueType Vector::at_local(size_type row, - size_type col) const noexcept +ValueType Vector::at_local(size_type row, size_type col) const + noexcept { return local_.at(row, col); } diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 3438b509983..e730a1f6e7e 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -144,6 +144,7 @@ void Coo::move_to( } +#if GKO_ENABLE_HALF template void Coo::convert_to( Coo>, IndexType>* result) const @@ -161,6 +162,7 @@ void Coo::move_to( { this->convert_to(result); } +#endif template diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 92f4665d828..b7659b0b3ac 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -257,7 +257,7 @@ void Csr::move_to( 
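The GKO_ADAPT_HF wrapper used throughout the instantiation macros above is defined in the include/ginkgo/core/base/types.hpp hunk further down in this patch. A minimal sketch of its intended expansion, assuming that definition:

    // With GKO_ENABLE_HALF enabled, a wrapped line such as
    //     GKO_ADAPT_HF(_macro(float, half, half, __VA_ARGS__));
    // expands to an ordinary explicit instantiation:
    //     template _macro(float, half, half, __VA_ARGS__);
    // With it disabled, the same line expands to
    //     static_assert(true, "...");
    // so every half instantiation drops out without leaving a stray ';'
    // behind the macro invocation.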
this->convert_to(result); } - +#if GKO_ENABLE_HALF template void Csr::convert_to( Csr>, IndexType>* result) const @@ -276,6 +276,7 @@ void Csr::move_to( { this->convert_to(result); } +#endif template diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 703ae70d0b6..7ca879dd436 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -585,6 +585,7 @@ void Dense::move_to(Dense>* result) } +#if GKO_ENABLE_HALF template void Dense::convert_to( Dense>>* result) const @@ -607,6 +608,7 @@ void Dense::move_to( { this->convert_to(result); } +#endif template diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index bbc017ffb3e..2f28b9662e8 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -192,6 +192,8 @@ void Diagonal::move_to(Diagonal>* result) this->convert_to(result); } + +#if GKO_ENABLE_HALF template void Diagonal::convert_to( Diagonal>>* result) const @@ -207,6 +209,7 @@ void Diagonal::move_to( { this->convert_to(result); } +#endif template diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index cd9067ac9dc..5bc87404eeb 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -202,6 +202,7 @@ void Ell::move_to( } +#if GKO_ENABLE_HALF template void Ell::convert_to( Ell>, IndexType>* result) const @@ -220,6 +221,7 @@ void Ell::move_to( { this->convert_to(result); } +#endif template diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index fb57db6979c..a977c9c304b 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -198,9 +198,11 @@ void Fbcsr::move_to( } +#if GKO_ENABLE_HALF template void Fbcsr::convert_to( - Fbcsr>, IndexType>* const result) const + Fbcsr>, IndexType>* const result) + const { result->values_ = this->values_; result->col_idxs_ = this->col_idxs_; @@ -217,6 +219,7 @@ void Fbcsr::move_to( { this->convert_to(result); } +#endif template diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index 2ea83e5a317..712aec21e55 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -181,6 +181,7 @@ void Hybrid::move_to( } +#if GKO_ENABLE_HALF template void Hybrid::convert_to( Hybrid>, IndexType>* result) const @@ -200,6 +201,7 @@ void Hybrid::move_to( { this->convert_to(result); } +#endif template diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp index f7a23206cda..ed087e69dc8 100644 --- a/core/matrix/row_gatherer.cpp +++ b/core/matrix/row_gatherer.cpp @@ -46,8 +46,14 @@ namespace matrix { template void RowGatherer::apply_impl(const LinOp* in, LinOp* out) const { - run*, const Dense*, const Dense*, + run< +#if GKO_ENABLE_HALF + const Dense*, +#endif + const Dense*, const Dense*, +#if GKO_ENABLE_HALF const Dense>*, +#endif const Dense>*, const Dense>*>( in, [&](auto gather) { gather->row_gather(&row_idxs_, out); }); } @@ -56,8 +62,14 @@ template void RowGatherer::apply_impl(const LinOp* alpha, const LinOp* in, const LinOp* beta, LinOp* out) const { - run*, const Dense*, const Dense*, + run< +#if GKO_ENABLE_HALF + const Dense*, +#endif + const Dense*, const Dense*, +#if GKO_ENABLE_HALF const Dense>*, +#endif const Dense>*, const Dense>*>( in, [&](auto gather) { gather->row_gather(alpha, &row_idxs_, beta, out); }); diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index 8f1bc6c050a..227d69e865d 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -177,6 +177,8 @@ void Sellp::move_to( this->convert_to(result); } + +#if GKO_ENABLE_HALF template void Sellp::convert_to( Sellp>, IndexType>* result) const @@ -197,6 +199,7 @@ void Sellp::move_to( { 
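The conversions targeting next_precision<next_precision<ValueType>> in the files above and below are only compiled when GKO_ENABLE_HALF is set. A plausible reading of why, based on the next_precision_impl specializations changed later in this patch:

    // GKO_ENABLE_HALF on:  the precision list is a 3-cycle
    //     half -> float -> double -> half
    //   so next_precision<next_precision<float>> is half, a genuinely new
    //   conversion target.
    // GKO_ENABLE_HALF off: the list degenerates to a 2-cycle
    //     float -> double -> float
    //   so next_precision<next_precision<float>> is float again, and the extra
    //   convert_to/move_to overloads would merely duplicate the existing ones.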
this->convert_to(result); } +#endif template diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index 8959a7abcd5..9df1b5d8870 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -237,6 +237,5 @@ void Pgm::generate() #define GKO_DECLARE_PGM(_vtype, _itype) class Pgm<_vtype, _itype> GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM); - } // namespace multigrid } // namespace gko diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 32a919013bb..9a2a6b6a6db 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -314,7 +314,14 @@ void MultigridState::generate(const LinOp* system_matrix_in, auto next_nrows = mg_level_list.at(i)->get_coarse_op()->get_size()[0]; auto mg_level = mg_level_list.at(i); - run, +#endif std::complex, std::complex>( mg_level, [&, this](auto mg_level, auto i, auto cycle, auto current_nrows, @@ -371,7 +378,14 @@ void MultigridState::run_mg_cycle(multigrid::cycle cycle, size_type level, return; } auto mg_level = multigrid->get_mg_level_list().at(level); - run, +#endif std::complex, std::complex>( mg_level, [&, this](auto mg_level) { using value_type = @@ -516,7 +530,14 @@ void Multigrid::generate() break; } - run, +#endif std::complex, std::complex>( mg_level, [this](auto mg_level, auto index, auto matrix) { @@ -554,7 +575,14 @@ void Multigrid::generate() auto last_mg_level = mg_level_list_.back(); // generate coarsest solver - run, +#endif std::complex, std::complex>( last_mg_level, [this](auto mg_level, auto level, auto matrix) { @@ -640,7 +668,14 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* b, LinOp* x, b, x); }; auto first_mg_level = this->get_mg_level_list().front(); - run, +#endif std::complex, std::complex>(first_mg_level, lambda, b, x); } @@ -679,7 +714,14 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* alpha, alpha, b, beta, x); }; auto first_mg_level = this->get_mg_level_list().front(); - run, +#endif std::complex, std::complex>(first_mg_level, lambda, alpha, b, beta, x); } @@ -744,7 +786,14 @@ void Multigrid::apply_dense_impl(const VectorType* b, VectorType* x, auto first_mg_level = this->get_mg_level_list().front(); - run, +#endif std::complex, std::complex>(first_mg_level, lambda, b, x); } diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 720b8c8a45d..616a3269cea 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -133,6 +133,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_HAVE_HWLOC @GINKGO_HAVE_HWLOC@ // clang-format on +/* Is half operation available ? */ +// clang-format off +#define GKO_ENABLE_HALF @GINKGO_ENABLE_HALF@ +// clang-format on + /* Do we need to use blocking communication in our SpMV? 
*/ // clang-format off diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index aad0f9b07e3..aedfeb770b1 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -439,10 +439,12 @@ namespace detail { template struct next_precision_impl {}; +#if GKO_ENABLE_HALF template <> struct next_precision_impl { using type = float; }; +#endif template <> struct next_precision_impl { @@ -451,7 +453,11 @@ struct next_precision_impl { template <> struct next_precision_impl { +#if GKO_ENABLE_HALF using type = half; +#else + using type = float; +#endif }; @@ -563,8 +569,13 @@ using next_precision = typename detail::next_precision_impl::type; * @note Currently our lists contains only two elements, so this is the same as * next_precision. */ +#if GKO_ENABLE_HALF template using previous_precision = next_precision>; +#else +template +using previous_precision = next_precision; +#endif /** diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 0b71bdf6f07..6304797056f 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -108,9 +108,6 @@ struct type_impl {}; GKO_REGISTER_MPI_TYPE(char, MPI_CHAR); GKO_REGISTER_MPI_TYPE(unsigned char, MPI_UNSIGNED_CHAR); GKO_REGISTER_MPI_TYPE(unsigned, MPI_UNSIGNED); -// OpenMPI 5.0 have support from MPIX_C_FLOAT16 and MPICHv3.4a1 MPIX_C_FLOAT16 -// TODO: it only works on the transferring -GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT); GKO_REGISTER_MPI_TYPE(int, MPI_INT); GKO_REGISTER_MPI_TYPE(unsigned short, MPI_UNSIGNED_SHORT); GKO_REGISTER_MPI_TYPE(unsigned long, MPI_UNSIGNED_LONG); @@ -120,8 +117,12 @@ GKO_REGISTER_MPI_TYPE(unsigned long long, MPI_UNSIGNED_LONG_LONG); GKO_REGISTER_MPI_TYPE(float, MPI_FLOAT); GKO_REGISTER_MPI_TYPE(double, MPI_DOUBLE); GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE); +#if GKO_ENABLE_HALF +// OpenMPI 5.0 have support from MPIX_C_FLOAT16 and MPICHv3.4a1 MPIX_C_FLOAT16 // TODO: it only works on the transferring +GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT); GKO_REGISTER_MPI_TYPE(std::complex, MPI_FLOAT); +#endif // GKO_ENABLE_HALF GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_FLOAT_COMPLEX); GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_DOUBLE_COMPLEX); diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 290c3a91890..fb7e8b5aed7 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -425,10 +425,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, _enable_macro(CudaExecutor, cuda) -#if GINKGO_COMPILE_KERNEL -#define GKO_ADAPT_CPHF(_macro) template _macro +#if GKO_ENABLE_HALF +#define GKO_ADAPT_HF(_macro) template _macro #else -#define GKO_ADAPT_CPHF(_macro) template _macro +#define GKO_ADAPT_HF(_macro) \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") #endif @@ -442,13 +445,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ - template _macro(half); \ + GKO_ADAPT_HF(_macro(half)); \ template _macro(float); \ template <> \ _macro(double) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ - template _macro(half); \ + GKO_ADAPT_HF(_macro(half)); \ template _macro(float); \ template _macro(double) #endif @@ -475,7 +478,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #else #define 
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ - GKO_ADAPT_CPHF(_macro(std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex)); \ template _macro(std::complex); \ template _macro(std::complex) #endif @@ -498,7 +501,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - template _macro(half, half); \ + GKO_ADAPT_HF(_macro(half, half)); \ template _macro(float, float); \ template <> \ _macro(double, double) GKO_NOT_IMPLEMENTED; \ @@ -509,15 +512,15 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template <> \ _macro(std::complex, double) GKO_NOT_IMPLEMENTED; #else -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - template _macro(half, half); \ - template _macro(float, float); \ - template _macro(double, double); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - GKO_ADAPT_CPHF(_macro(std::complex, half)); \ - template _macro(std::complex, float); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + GKO_ADAPT_HF(_macro(half, half)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex); \ + GKO_ADAPT_HF(_macro(std::complex, half)); \ + template _macro(std::complex, float); \ template _macro(std::complex, double) #endif @@ -546,20 +549,20 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ - template _macro(half, int32); \ + GKO_ADAPT_HF(_macro(half, int32)); \ template _macro(float, int32); \ template <> \ _macro(double, int32) GKO_NOT_IMPLEMENTED; \ - template _macro(half, int64); \ + GKO_ADAPT_HF(_macro(half, int64)); \ template _macro(float, int64); \ template <> \ _macro(double, int64) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ - template _macro(half, int32); \ + GKO_ADAPT_HF(_macro(half, int32)); \ template _macro(float, int32); \ template _macro(double, int32); \ - template _macro(half, int64); \ + GKO_ADAPT_HF(_macro(half, int64)); \ template _macro(float, int64); \ template _macro(double, int64) #endif @@ -585,10 +588,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ - GKO_ADAPT_CPHF(_macro(std::complex, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ - GKO_ADAPT_CPHF(_macro(std::complex, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64)); \ template _macro(std::complex, int64); \ template _macro(std::complex, int64) #endif @@ -606,9 +609,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro) \ - template _macro(half, int32, int32); \ - template _macro(half, int32, int64); \ - template _macro(half, int64, int64); \ + GKO_ADAPT_HF(_macro(half, int32, int32)); \ + GKO_ADAPT_HF(_macro(half, int32, int64)); \ + GKO_ADAPT_HF(_macro(half, 
int64, int64)); \ template _macro(float, int32, int32); \ template _macro(float, int32, int64); \ template _macro(float, int64, int64); \ @@ -621,9 +624,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro) \ - template _macro(half, int32, int32); \ - template _macro(half, int32, int64); \ - template _macro(half, int64, int64); \ + GKO_ADAPT_HF(_macro(half, int32, int32)); \ + GKO_ADAPT_HF(_macro(half, int32, int64)); \ + GKO_ADAPT_HF(_macro(half, int64, int64)); \ template _macro(float, int32, int32); \ template _macro(float, int32, int64); \ template _macro(float, int64, int64); \ @@ -658,9 +661,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ - GKO_ADAPT_CPHF(_macro(std::complex, int32, int32)); \ - GKO_ADAPT_CPHF(_macro(std::complex, int32, int64)); \ - GKO_ADAPT_CPHF(_macro(std::complex, int64, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64, int64)); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -700,18 +703,18 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template _macro(float, double); \ - template _macro(double, float); \ - template _macro(half, double); \ - template _macro(double, half); \ - template _macro(float, half); \ - template _macro(half, float); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ + template _macro(float, double); \ + template _macro(double, float); \ + GKO_ADAPT_HF(_macro(half, double)); \ + GKO_ADAPT_HF(_macro(double, half)); \ + GKO_ADAPT_HF(_macro(float, half)); \ + GKO_ADAPT_HF(_macro(half, float)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -724,13 +727,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. 
*/ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ - GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ - template _macro(half, half); \ - template _macro(float, float); \ - template _macro(double, double); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ + GKO_ADAPT_HF(_macro(half, half)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) #endif @@ -743,15 +746,15 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ - template _macro(half, half); \ - template _macro(float, float); \ - template _macro(double, double); \ - GKO_ADAPT_CPHF(_macro(std::complex, half)); \ - template _macro(std::complex, float); \ - template _macro(std::complex, double); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ + GKO_ADAPT_HF(_macro(half, half)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(_macro(std::complex, half)); \ + template _macro(std::complex, float); \ + template _macro(std::complex, double); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -770,11 +773,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(int64, int64); \ template _macro(unsigned int, unsigned int); \ template _macro(unsigned long, unsigned long); \ - template _macro(half, half); \ + GKO_ADAPT_HF(_macro(half, half)); \ template _macro(float, float); \ template _macro(double, double); \ template _macro(long double, long double); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -787,10 +790,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * value and index types. 
*/ #define GKO_INSTANTIATE_FOR_EACH_POD_TYPE(_macro) \ - template _macro(half); \ + GKO_ADAPT_HF(_macro(half)); \ template _macro(float); \ template _macro(double); \ - GKO_ADAPT_CPHF(_macro(std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex)); \ template _macro(std::complex); \ template _macro(std::complex); \ template _macro(size_type); \ diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index b3fca57f341..10965ae4146 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -268,15 +268,15 @@ class Matrix Matrix>, public ConvertibleTo< Matrix, LocalIndexType, GlobalIndexType>>, +#if GKO_ENABLE_HALF public ConvertibleTo>, LocalIndexType, GlobalIndexType>>, +#endif public DistributedBase { friend class EnableCreateMethod; friend class EnableDistributedPolymorphicObject; friend class Matrix, LocalIndexType, GlobalIndexType>; - friend class Matrix>, - LocalIndexType, GlobalIndexType>; public: using value_type = ValueType; @@ -299,6 +299,9 @@ class Matrix void move_to(Matrix, local_index_type, global_index_type>* result) override; +#if GKO_ENABLE_HALF + friend class Matrix>, + LocalIndexType, GlobalIndexType>; void convert_to( Matrix>, local_index_type, @@ -307,6 +310,7 @@ class Matrix void move_to(Matrix>, local_index_type, global_index_type>* result) override; +#endif /** * Reads a square matrix from the device_matrix_data structure and a global * partition. diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index 2547e2da2c9..b4657582ae5 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -88,7 +88,9 @@ class Vector : public EnableDistributedLinOp>, public EnableCreateMethod>, public ConvertibleTo>>, +#if GKO_ENABLE_HALF public ConvertibleTo>>>, +#endif public EnableAbsoluteComputation>>, public DistributedBase { friend class EnableCreateMethod; @@ -96,7 +98,6 @@ class Vector friend class Vector>; friend class Vector>; friend class Vector>; - friend class Vector>>; public: using EnableDistributedLinOp::convert_to; @@ -195,11 +196,15 @@ class Vector void move_to(Vector>* result) override; +#if GKO_ENABLE_HALF + friend class Vector>>; + void convert_to(Vector>>* result) const override; void move_to( Vector>>* result) override; +#endif std::unique_ptr compute_absolute() const override; diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index af68b66679d..f12c637b4ba 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -78,7 +78,10 @@ template class Coo : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, - public ConvertibleTo>, IndexType>>, +#if GKO_ENABLE_HALF + public ConvertibleTo< + Coo>, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -113,17 +116,21 @@ class Coo : public EnableLinOp>, friend class Coo, IndexType>; - friend class Coo>, IndexType>; - void convert_to( Coo, IndexType>* result) const override; void move_to(Coo, IndexType>* result) override; - void convert_to( - Coo>, IndexType>* result) const override; +#if GKO_ENABLE_HALF + friend class Coo>, + IndexType>; + + void convert_to(Coo>, IndexType>* + result) const override; - void move_to(Coo>, IndexType>* result) override; + void move_to(Coo>, IndexType>* + result) override; +#endif void convert_to(Csr* other) const override; diff --git 
a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index 610859665fa..4b46bb09f05 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -127,8 +127,10 @@ template class Csr : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, +#if GKO_ENABLE_HALF public ConvertibleTo< Csr>, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -720,19 +722,21 @@ class Csr : public EnableLinOp>, friend class Csr, IndexType>; - friend class Csr>, - IndexType>; - void convert_to( Csr, IndexType>* result) const override; void move_to(Csr, IndexType>* result) override; +#if GKO_ENABLE_HALF + friend class Csr>, + IndexType>; + void convert_to(Csr>, IndexType>* result) const override; void move_to(Csr>, IndexType>* result) override; +#endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 8c05a5bf61a..464406a6d95 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -88,17 +88,6 @@ template class SparsityCsr; -class Empty {}; - -template -using next2_type = next_precision>; - - -// template -// using conditional_type = typename std::conditional< -// std::is_same>::value, Empty, -// Dense>>::type; - /** * Dense is a matrix format which explicitly stores all values of the matrix. * @@ -119,7 +108,9 @@ class Dense : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo>>, +#if GKO_ENABLE_HALF public ConvertibleTo>>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -308,19 +299,21 @@ class Dense return other->create_const_view_of_impl(); } - friend class Dense>; + friend class Dense>; - friend class Dense>>; + void convert_to(Dense>* result) const override; + + void move_to(Dense>* result) override; + +#if GKO_ENABLE_HALF + friend class Dense>>; void convert_to(Dense>>* result) const override; void move_to( Dense>>* result) override; - - void convert_to(Dense>* result) const override; - - void move_to(Dense>* result) override; +#endif void convert_to(Coo* result) const override; diff --git a/include/ginkgo/core/matrix/diagonal.hpp b/include/ginkgo/core/matrix/diagonal.hpp index 414b9b40d43..e9bf45349f7 100644 --- a/include/ginkgo/core/matrix/diagonal.hpp +++ b/include/ginkgo/core/matrix/diagonal.hpp @@ -71,7 +71,9 @@ class Diagonal public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>>, +#if GKO_ENABLE_HALF public ConvertibleTo>>>, +#endif public Transposable, public WritableToMatrixData, public WritableToMatrixData, @@ -104,8 +106,6 @@ class Diagonal friend class Diagonal>; - friend class Diagonal>>; - std::unique_ptr transpose() const override; std::unique_ptr conj_transpose() const override; @@ -114,9 +114,15 @@ class Diagonal void move_to(Diagonal>* result) override; - void convert_to(Diagonal>>* result) const override; +#if GKO_ENABLE_HALF + friend class Diagonal>>; + + void convert_to(Diagonal>>* result) + const override; - void move_to(Diagonal>>* result) override; + void move_to( + Diagonal>>* result) override; +#endif void convert_to(Csr* result) const override; diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp index 12429084a1b..d5bdf29b760 100644 --- a/include/ginkgo/core/matrix/ell.hpp +++ b/include/ginkgo/core/matrix/ell.hpp @@ -80,7 +80,10 @@ template class Ell : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, 
- public ConvertibleTo>, IndexType>>, +#if GKO_ENABLE_HALF + public ConvertibleTo< + Ell>, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -95,7 +98,6 @@ class Ell : public EnableLinOp>, friend class Csr; friend class Ell, IndexType>; friend class Ell, IndexType>; - friend class Ell>, IndexType>; friend class Hybrid; public: @@ -120,10 +122,16 @@ class Ell : public EnableLinOp>, void move_to(Ell, IndexType>* result) override; - void convert_to( - Ell>, IndexType>* result) const override; +#if GKO_ENABLE_HALF + friend class Ell>, + IndexType>; + + void convert_to(Ell>, IndexType>* + result) const override; - void move_to(Ell>, IndexType>* result) override; + void move_to(Ell>, IndexType>* + result) override; +#endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp index d85dc6c1e0a..f66ea976084 100644 --- a/include/ginkgo/core/matrix/fbcsr.hpp +++ b/include/ginkgo/core/matrix/fbcsr.hpp @@ -127,7 +127,10 @@ template class Fbcsr : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, - public ConvertibleTo>, IndexType>>, +#if GKO_ENABLE_HALF + public ConvertibleTo< + Fbcsr>, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -177,17 +180,22 @@ class Fbcsr : public EnableLinOp>, using ConvertibleTo>::move_to; friend class Fbcsr, IndexType>; - friend class Fbcsr>, IndexType>; void convert_to( Fbcsr, IndexType>* result) const override; void move_to(Fbcsr, IndexType>* result) override; - void convert_to( - Fbcsr>, IndexType>* result) const override; +#if GKO_ENABLE_HALF + friend class Fbcsr>, + IndexType>; + + void convert_to(Fbcsr>, IndexType>* + result) const override; - void move_to(Fbcsr>, IndexType>* result) override; + void move_to(Fbcsr>, IndexType>* + result) override; +#endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index 600a165d7fe..700da8fb8f9 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -72,7 +72,10 @@ class Hybrid : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, - public ConvertibleTo>, IndexType>>, +#if GKO_ENABLE_HALF + public ConvertibleTo< + Hybrid>, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -389,17 +392,21 @@ class Hybrid friend class Hybrid, IndexType>; - friend class Hybrid>, IndexType>; - void convert_to( Hybrid, IndexType>* result) const override; void move_to(Hybrid, IndexType>* result) override; - void convert_to( - Hybrid>, IndexType>* result) const override; +#if GKO_ENABLE_HALF + friend class Hybrid>, + IndexType>; + + void convert_to(Hybrid>, + IndexType>* result) const override; - void move_to(Hybrid>, IndexType>* result) override; + void move_to(Hybrid>, IndexType>* + result) override; +#endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp index c26685eab9a..1866d75eb2d 100644 --- a/include/ginkgo/core/matrix/sellp.hpp +++ b/include/ginkgo/core/matrix/sellp.hpp @@ -71,7 +71,10 @@ template class Sellp : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, - public ConvertibleTo>, IndexType>>, +#if GKO_ENABLE_HALF + public ConvertibleTo< + Sellp>, IndexType>>, +#endif public ConvertibleTo>, public 
ConvertibleTo>, public DiagonalExtractable, @@ -104,17 +107,22 @@ class Sellp : public EnableLinOp>, using absolute_type = remove_complex; friend class Sellp, IndexType>; - friend class Sellp>, IndexType>; void convert_to( Sellp, IndexType>* result) const override; void move_to(Sellp, IndexType>* result) override; - void convert_to( - Sellp>, IndexType>* result) const override; +#if GKO_ENABLE_HALF + friend class Sellp>, + IndexType>; + + void convert_to(Sellp>, IndexType>* + result) const override; - void move_to(Sellp>, IndexType>* result) override; + void move_to(Sellp>, IndexType>* + result) override; +#endif void convert_to(Dense* other) const override; From 620aef5e042318b4ce2ed5cf8334ce29088af705 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 8 Feb 2023 17:28:40 +0100 Subject: [PATCH 17/48] fix macro --- core/distributed/matrix.cpp | 2 +- core/distributed/vector.cpp | 2 +- core/matrix/coo.cpp | 2 +- core/matrix/csr.cpp | 2 +- core/matrix/dense.cpp | 2 +- core/matrix/diagonal.cpp | 2 +- core/matrix/ell.cpp | 2 +- core/matrix/fbcsr.cpp | 2 +- core/matrix/hybrid.cpp | 2 +- core/matrix/row_gatherer.cpp | 8 ++++---- core/matrix/sellp.cpp | 2 +- core/solver/multigrid.cpp | 24 +++++++++++----------- include/ginkgo/config.hpp.in | 2 +- include/ginkgo/core/base/math.hpp | 6 +++--- include/ginkgo/core/base/mpi.hpp | 2 +- include/ginkgo/core/base/types.hpp | 5 ++++- include/ginkgo/core/distributed/matrix.hpp | 4 ++-- include/ginkgo/core/distributed/vector.hpp | 11 ++++++---- include/ginkgo/core/matrix/coo.hpp | 4 ++-- include/ginkgo/core/matrix/csr.hpp | 4 ++-- include/ginkgo/core/matrix/dense.hpp | 4 ++-- include/ginkgo/core/matrix/diagonal.hpp | 4 ++-- include/ginkgo/core/matrix/ell.hpp | 4 ++-- include/ginkgo/core/matrix/fbcsr.hpp | 4 ++-- include/ginkgo/core/matrix/hybrid.hpp | 4 ++-- include/ginkgo/core/matrix/sellp.hpp | 4 ++-- 26 files changed, 60 insertions(+), 54 deletions(-) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 262418b7e85..2325047cc78 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -139,7 +139,7 @@ void Matrix::move_to( } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Matrix::convert_to( Matrix>, local_index_type, diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index d584b0f8dd4..81b9c96bcfc 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -290,7 +290,7 @@ void Vector::move_to(Vector>* result) } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Vector::convert_to( Vector>>* result) const diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index e730a1f6e7e..104802775ec 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -144,7 +144,7 @@ void Coo::move_to( } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Coo::convert_to( Coo>, IndexType>* result) const diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index b7659b0b3ac..24ba1c2aebf 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -257,7 +257,7 @@ void Csr::move_to( this->convert_to(result); } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Csr::convert_to( Csr>, IndexType>* result) const diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 7ca879dd436..5ea55ced906 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -585,7 +585,7 @@ void Dense::move_to(Dense>* result) } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Dense::convert_to( Dense>>* result) const diff --git 
a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index 2f28b9662e8..0d4540a615b 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -193,7 +193,7 @@ void Diagonal::move_to(Diagonal>* result) } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Diagonal::convert_to( Diagonal>>* result) const diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index 5bc87404eeb..9f37d0a85f7 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -202,7 +202,7 @@ void Ell::move_to( } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Ell::convert_to( Ell>, IndexType>* result) const diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index a977c9c304b..14b9c226bc8 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -198,7 +198,7 @@ void Fbcsr::move_to( } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Fbcsr::convert_to( Fbcsr>, IndexType>* const result) diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index 712aec21e55..d6c802cfa01 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -181,7 +181,7 @@ void Hybrid::move_to( } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Hybrid::convert_to( Hybrid>, IndexType>* result) const diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp index ed087e69dc8..f2ec59da2e5 100644 --- a/core/matrix/row_gatherer.cpp +++ b/core/matrix/row_gatherer.cpp @@ -47,11 +47,11 @@ template void RowGatherer::apply_impl(const LinOp* in, LinOp* out) const { run< -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF const Dense*, #endif const Dense*, const Dense*, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF const Dense>*, #endif const Dense>*, const Dense>*>( @@ -63,11 +63,11 @@ void RowGatherer::apply_impl(const LinOp* alpha, const LinOp* in, const LinOp* beta, LinOp* out) const { run< -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF const Dense*, #endif const Dense*, const Dense*, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF const Dense>*, #endif const Dense>*, const Dense>*>( diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index 227d69e865d..4f36fb1b6c1 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -178,7 +178,7 @@ void Sellp::move_to( } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Sellp::convert_to( Sellp>, IndexType>* result) const diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 9a2a6b6a6db..57c04b8e95e 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -315,11 +315,11 @@ void MultigridState::generate(const LinOp* system_matrix_in, auto mg_level = mg_level_list.at(i); run, #endif std::complex, std::complex>( @@ -379,11 +379,11 @@ void MultigridState::run_mg_cycle(multigrid::cycle cycle, size_type level, } auto mg_level = multigrid->get_mg_level_list().at(level); run, #endif std::complex, std::complex>( @@ -531,11 +531,11 @@ void Multigrid::generate() } run, #endif std::complex, std::complex>( @@ -576,11 +576,11 @@ void Multigrid::generate() // generate coarsest solver run, #endif std::complex, std::complex>( @@ -669,11 +669,11 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* b, LinOp* x, }; auto first_mg_level = this->get_mg_level_list().front(); run, #endif std::complex, std::complex>(first_mg_level, lambda, b, @@ -715,11 +715,11 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* alpha, }; auto first_mg_level = this->get_mg_level_list().front(); run, #endif std::complex, std::complex>(first_mg_level, lambda, diff --git 
a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 616a3269cea..ca2aa30e6f0 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -135,7 +135,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* Is half operation available ? */ // clang-format off -#define GKO_ENABLE_HALF @GINKGO_ENABLE_HALF@ +#cmakedefine01 GINKGO_ENABLE_HALF // clang-format on diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index aedfeb770b1..75d0de9a511 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -439,7 +439,7 @@ namespace detail { template struct next_precision_impl {}; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template <> struct next_precision_impl { using type = float; @@ -453,7 +453,7 @@ struct next_precision_impl { template <> struct next_precision_impl { -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF using type = half; #else using type = float; @@ -569,7 +569,7 @@ using next_precision = typename detail::next_precision_impl::type; * @note Currently our lists contains only two elements, so this is the same as * next_precision. */ -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template using previous_precision = next_precision>; #else diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 6304797056f..42653015725 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -117,7 +117,7 @@ GKO_REGISTER_MPI_TYPE(unsigned long long, MPI_UNSIGNED_LONG_LONG); GKO_REGISTER_MPI_TYPE(float, MPI_FLOAT); GKO_REGISTER_MPI_TYPE(double, MPI_DOUBLE); GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE); -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF // OpenMPI 5.0 have support from MPIX_C_FLOAT16 and MPICHv3.4a1 MPIX_C_FLOAT16 // TODO: it only works on the transferring GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT); diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index fb7e8b5aed7..3ad8d6684d4 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -46,6 +46,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
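On the config.hpp.in change above: with #cmakedefine01, CMake's configure_file always emits a definition, 1 when the GINKGO_ENABLE_HALF option is ON and 0 otherwise, which is what lets the guards in this series use #if GINKGO_ENABLE_HALF rather than #ifdef. A hypothetical configure invocation and the resulting header line:

    // cmake -S . -B build -DGINKGO_ENABLE_HALF=ON   ->   #define GINKGO_ENABLE_HALF 1
    // cmake -S . -B build                           ->   #define GINKGO_ENABLE_HALF 0   (the option defaults to OFF)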
#include +#include + + #ifdef __HIPCC__ #include #endif // __HIPCC__ @@ -425,7 +428,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, _enable_macro(CudaExecutor, cuda) -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF #define GKO_ADAPT_HF(_macro) template _macro #else #define GKO_ADAPT_HF(_macro) \ diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index 10965ae4146..c5de9e2dfec 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -268,7 +268,7 @@ class Matrix Matrix>, public ConvertibleTo< Matrix, LocalIndexType, GlobalIndexType>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo>, LocalIndexType, GlobalIndexType>>, #endif @@ -299,7 +299,7 @@ class Matrix void move_to(Matrix, local_index_type, global_index_type>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Matrix>, LocalIndexType, GlobalIndexType>; diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index b4657582ae5..d83467a4078 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -88,7 +88,7 @@ class Vector : public EnableDistributedLinOp>, public EnableCreateMethod>, public ConvertibleTo>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo>>>, #endif public EnableAbsoluteComputation>>, @@ -196,7 +196,7 @@ class Vector void move_to(Vector>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Vector>>; void convert_to(Vector>>* result) @@ -654,8 +654,6 @@ struct conversion_target_helper> { using target_type = experimental::distributed::Vector; using source_type = experimental::distributed::Vector>; - using snd_source_type = experimental::distributed::Vector< - previous_precision>>; static std::unique_ptr create_empty(const source_type* source) { @@ -663,12 +661,17 @@ struct conversion_target_helper> { source->get_communicator()); } +#if GINKGO_ENABLE_HALF + using snd_source_type = experimental::distributed::Vector< + previous_precision>>; + static std::unique_ptr create_empty( const snd_source_type* source) { return target_type::create(source->get_executor(), source->get_communicator()); } +#endif }; diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index f12c637b4ba..22d81039546 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -78,7 +78,7 @@ template class Coo : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo< Coo>, IndexType>>, #endif @@ -121,7 +121,7 @@ class Coo : public EnableLinOp>, void move_to(Coo, IndexType>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Coo>, IndexType>; diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index 4b46bb09f05..8015940ced0 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -127,7 +127,7 @@ template class Csr : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo< Csr>, IndexType>>, #endif @@ -727,7 +727,7 @@ class Csr : public EnableLinOp>, void move_to(Csr, IndexType>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Csr>, IndexType>; diff --git a/include/ginkgo/core/matrix/dense.hpp 
b/include/ginkgo/core/matrix/dense.hpp index 464406a6d95..0daa5987188 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -108,7 +108,7 @@ class Dense : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo>>>, #endif public ConvertibleTo>, @@ -305,7 +305,7 @@ class Dense void move_to(Dense>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Dense>>; void convert_to(Dense>>* result) diff --git a/include/ginkgo/core/matrix/diagonal.hpp b/include/ginkgo/core/matrix/diagonal.hpp index e9bf45349f7..3202ea1ef07 100644 --- a/include/ginkgo/core/matrix/diagonal.hpp +++ b/include/ginkgo/core/matrix/diagonal.hpp @@ -71,7 +71,7 @@ class Diagonal public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo>>>, #endif public Transposable, @@ -114,7 +114,7 @@ class Diagonal void move_to(Diagonal>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Diagonal>>; void convert_to(Diagonal>>* result) diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp index d5bdf29b760..6c337a5b634 100644 --- a/include/ginkgo/core/matrix/ell.hpp +++ b/include/ginkgo/core/matrix/ell.hpp @@ -80,7 +80,7 @@ template class Ell : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo< Ell>, IndexType>>, #endif @@ -122,7 +122,7 @@ class Ell : public EnableLinOp>, void move_to(Ell, IndexType>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Ell>, IndexType>; diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp index f66ea976084..5f9f96ed64a 100644 --- a/include/ginkgo/core/matrix/fbcsr.hpp +++ b/include/ginkgo/core/matrix/fbcsr.hpp @@ -127,7 +127,7 @@ template class Fbcsr : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo< Fbcsr>, IndexType>>, #endif @@ -186,7 +186,7 @@ class Fbcsr : public EnableLinOp>, void move_to(Fbcsr, IndexType>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Fbcsr>, IndexType>; diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index 700da8fb8f9..ec6be5ef82b 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -72,7 +72,7 @@ class Hybrid : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo< Hybrid>, IndexType>>, #endif @@ -397,7 +397,7 @@ class Hybrid void move_to(Hybrid, IndexType>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Hybrid>, IndexType>; diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp index 1866d75eb2d..9dcfe547734 100644 --- a/include/ginkgo/core/matrix/sellp.hpp +++ b/include/ginkgo/core/matrix/sellp.hpp @@ -71,7 +71,7 @@ template class Sellp : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo< Sellp>, IndexType>>, #endif @@ -113,7 +113,7 @@ class Sellp : public EnableLinOp>, void move_to(Sellp, IndexType>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class 
Sellp>, IndexType>; From 06a6f46a6b4d69dd3c348ba56a9caffaac8b4c64 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 8 Feb 2023 21:46:30 +0100 Subject: [PATCH 18/48] clean and refine the code --- accessor/cuda_helper.hpp | 3 ++ common/cuda_hip/base/math.hpp.inc | 10 ----- .../distributed/matrix_kernels.hpp.inc | 8 ++-- .../unified/components/fill_array_kernels.cpp | 8 +++- core/stop/residual_norm.cpp | 4 +- cuda/CMakeLists.txt | 1 - cuda/base/types.hpp | 6 ++- cuda/matrix/csr_kernels.template.cu | 5 +-- cuda/solver/common_trs_kernels.cuh | 14 ++++--- dpcpp/CMakeLists.txt | 1 - dpcpp/components/cooperative_groups.dp.hpp | 6 --- dpcpp/matrix/csr_kernels.dp.cpp | 9 +---- dpcpp/matrix/dense_kernels.dp.cpp | 18 +-------- hip/CMakeLists.txt | 1 - hip/base/types.hip.hpp | 15 +++----- .../jacobi_generate_instantiate.inc.hip.cpp | 12 ------ hip/solver/idr_kernels.hip.cpp | 8 ++-- include/ginkgo/core/base/math.hpp | 31 +++++++--------- .../ginkgo/core/base/precision_dispatch.hpp | 37 ++++++------------- include/ginkgo/ginkgo.hpp | 1 + omp/CMakeLists.txt | 2 - omp/solver/idr_kernels.cpp | 12 +++--- reference/CMakeLists.txt | 2 - reference/matrix/diagonal_kernels.cpp | 1 - test/mpi/matrix.cpp | 4 +- 25 files changed, 75 insertions(+), 144 deletions(-) diff --git a/accessor/cuda_helper.hpp b/accessor/cuda_helper.hpp index c1e1696acbf..d11b934e90e 100644 --- a/accessor/cuda_helper.hpp +++ b/accessor/cuda_helper.hpp @@ -47,6 +47,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "utils.hpp" +struct __half; + + namespace gko { namespace acc { namespace detail { diff --git a/common/cuda_hip/base/math.hpp.inc b/common/cuda_hip/base/math.hpp.inc index 54a165b8494..fa2850c10d6 100644 --- a/common/cuda_hip/base/math.hpp.inc +++ b/common/cuda_hip/base/math.hpp.inc @@ -49,16 +49,6 @@ struct remove_complex_impl> { }; -// template -// struct is_complex_impl> -// : public std::integral_constant {}; - - -// template -// struct is_complex_or_scalar_impl> -// : is_complex_or_scalar_impl {}; - - template struct truncate_type_impl> { using type = thrust::complex::type>; diff --git a/common/cuda_hip/distributed/matrix_kernels.hpp.inc b/common/cuda_hip/distributed/matrix_kernels.hpp.inc index 7457af72267..47c0aab04e4 100644 --- a/common/cuda_hip/distributed/matrix_kernels.hpp.inc +++ b/common/cuda_hip/distributed/matrix_kernels.hpp.inc @@ -150,10 +150,10 @@ void build_local_nonlocal( local_values.resize_and_reset(num_local_elements); auto local_it = thrust::make_transform_iterator( input_it, [map_to_local_row, map_to_local_col] __host__ __device__( - const input_type input2) { - auto local_row = map_to_local_row(input2.row, input2.row_range); - auto local_col = map_to_local_col(input2.col, input2.col_range); - return thrust::make_tuple(local_row, local_col, input2.val); + const input_type input) { + auto local_row = map_to_local_row(input.row, input.row_range); + auto local_col = map_to_local_col(input.col, input.col_range); + return thrust::make_tuple(local_row, local_col, input.val); }); thrust::copy_if( thrust_policy(exec), local_it, local_it + input.get_num_elems(), diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp index bb6ad681503..04167661a4d 100644 --- a/common/unified/components/fill_array_kernels.cpp +++ b/common/unified/components/fill_array_kernels.cpp @@ -60,9 +60,13 @@ template void fill_seq_array(std::shared_ptr exec, ValueType* array, size_type n) { + // __half only has long long not int64_t run_kernel( 
- exec, [] GKO_KERNEL(auto idx, auto array) { array[idx] = static_cast(idx); }, n, - array); + exec, + [] GKO_KERNEL(auto idx, auto array) { + array[idx] = static_cast(idx); + }, + n, array); } GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL); diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp index e59364ddff1..ee02c8042d2 100644 --- a/core/stop/residual_norm.cpp +++ b/core/stop/residual_norm.cpp @@ -127,8 +127,8 @@ ResidualNormBase::ResidualNormBase( baseline_{baseline}, system_matrix_{args.system_matrix}, b_{args.b}, - one_{gko::initialize({one()}, exec)}, - neg_one_{gko::initialize({-one()}, exec)} + one_{gko::initialize({1}, exec)}, + neg_one_{gko::initialize({-1}, exec)} { switch (baseline_) { case mode::initial_resnorm: { diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index f882c31c1e7..4c972d2a584 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -114,7 +114,6 @@ endif() target_compile_options(ginkgo_cuda PRIVATE $<$:${GINKGO_CUDA_COMPILER_FLAGS}>) target_compile_options(ginkgo_cuda PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) -target_compile_definitions(ginkgo_cuda PRIVATE GINKGO_COMPILE_KERNEL=1) ginkgo_compile_features(ginkgo_cuda) target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA) diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index a65307016e0..27b0f95e9da 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -94,7 +94,8 @@ THRUST_HALF_FRIEND_OPERATOR(/, /=) namespace gko { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +// from the cuda_fp16.hpp +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 template <> @@ -127,7 +128,8 @@ __device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val) namespace kernels { namespace cuda { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu index 657b2a3a1ca..1b4b20a1e75 100644 --- a/cuda/matrix/csr_kernels.template.cu +++ b/cuda/matrix/csr_kernels.template.cu @@ -258,9 +258,8 @@ void classical_spmv(syn::value_list, exec->get_num_multiprocessor() * classical_oversubscription; const auto gridx = - ceildiv(a->get_size()[0], spmv_block_size / subwarp_size); - // std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), - // int64(nwarps / warps_in_block)); + std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), + int64(nwarps / warps_in_block)); const dim3 grid(gridx, b->get_size()[1]); const auto block = spmv_block_size; diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 97587a7d2cd..26d17f466e8 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -240,14 +240,15 @@ struct CudaSolveStruct : gko::solver::SolveStruct { policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; size_type work_size{}; - // In nullptr is considered nullptr_t not casted to const ValueType* + // TODO: In nullptr is considered nullptr_t not casted to const + // ValueType* it works as expected now cusparse::buffer_size_ext( handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), (const ValueType*)(nullptr), num_rhs, - solve_info, policy, &work_size); + 
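A minimal host-side sketch of the point the fill_seq_array comment makes (illustrative only, not part of the patch): the device half type's converting constructors cover long long but not the int64-style index type the kernel iterates with, so the loop index is converted explicitly before the assignment. The function name below is a placeholder.

    #include <cstdint>

    template <typename ValueType>
    void fill_seq_reference(ValueType* array, std::int64_t n)
    {
        for (std::int64_t idx = 0; idx < n; ++idx) {
            // mirrors the static_cast added to the device kernel above
            array[idx] = static_cast<ValueType>(idx);
        }
    }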
matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, + &work_size); // allocate workspace work.resize_and_reset(work_size); @@ -257,8 +258,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct { CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), (const ValueType*)(nullptr), num_rhs, - solve_info, policy, work.get_data()); + matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, + work.get_data()); } void solve(const matrix::Csr* matrix, @@ -484,7 +485,8 @@ __global__ void sptrsv_naive_legacy_kernel( const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1]; const int row_step = is_upper ? -1 : 1; - ValueType sum = ValueType{0.0}; + // no constructor from double to thrust<__half> + ValueType sum = zero(); auto j = row_begin; auto col = colidxs[j]; while (j != row_end) { diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 35b71f7c54d..dd0d7c4cdfb 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -110,7 +110,6 @@ target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL) if (GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) endif() -target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_COMPILE_KERNEL=1) ginkgo_default_includes(ginkgo_dpcpp) ginkgo_install_library(ginkgo_dpcpp) diff --git a/dpcpp/components/cooperative_groups.dp.hpp b/dpcpp/components/cooperative_groups.dp.hpp index 908b062e692..e2212285954 100644 --- a/dpcpp/components/cooperative_groups.dp.hpp +++ b/dpcpp/components/cooperative_groups.dp.hpp @@ -43,12 +43,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/base/config.hpp" #include "dpcpp/base/dpct.hpp" -// namespace sycl { -// namespace detail { -// template <> -// struct is_arithmetic : public std::false_type {}; -// } // namespace detail -// } // namespace sycl namespace gko { namespace kernels { diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index bd84606e7fd..415acd2cdc8 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -62,6 +62,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
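The zero<ValueType>() initialization in the sptrsv kernel above sidesteps the missing double constructor that the comment mentions. A hedged sketch of the same pattern in isolation, with placeholder names and the accumulation operator assumed to exist for the instantiated value types:

    template <typename ValueType>
    __device__ ValueType accumulate_sketch(const ValueType* vals, int n)
    {
        // ValueType{0.0} does not compile when ValueType is
        // thrust::complex<__half>, which has no constructor taking a double;
        // value-initialization (or zero<ValueType>()) avoids the problem
        ValueType sum = ValueType{};
        for (int i = 0; i < n; ++i) {
            sum += vals[i];
        }
        return sum;
    }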
#include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" +#include "dpcpp/base/onemkl_bindings.hpp" #include "dpcpp/components/atomic.dp.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" @@ -1236,14 +1237,6 @@ void load_balance_spmv(std::shared_ptr exec, } } -template -struct onemkl_support : std::false_type {}; - -template <> -struct onemkl_support : std::true_type {}; - -template <> -struct onemkl_support : std::true_type {}; template bool try_general_sparselib_spmv(std::shared_ptr exec, diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 86a45e12efa..a73bb226f3b 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -242,20 +242,6 @@ void compute_norm2_dispatch(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); -template -struct onemkl_support : std::false_type {}; - -template <> -struct onemkl_support : std::true_type {}; - -template <> -struct onemkl_support : std::true_type {}; - -template <> -struct onemkl_support> : std::true_type {}; - -template <> -struct onemkl_support> : std::true_type {}; template void simple_apply(std::shared_ptr exec, @@ -264,7 +250,7 @@ void simple_apply(std::shared_ptr exec, matrix::Dense* c) { using namespace oneapi::mkl; - if constexpr (onemkl_support::value) { + if constexpr (onemkl::is_supported::value) { if (b->get_stride() != 0 && c->get_stride() != 0) { if (a->get_size()[1] > 0) { oneapi::mkl::blas::row_major::gemm( @@ -292,7 +278,7 @@ void apply(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { using namespace oneapi::mkl; - if constexpr (onemkl_support::value) { + if constexpr (onemkl::is_supported::value) { if (b->get_stride() != 0 && c->get_stride() != 0) { if (a->get_size()[1] > 0) { oneapi::mkl::blas::row_major::gemm( diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index a28029fc441..779db13d36a 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -127,7 +127,6 @@ if (GINKGO_HAVE_ROCTX) endif() target_compile_options(ginkgo_hip PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) -target_compile_definitions(ginkgo_hip PRIVATE GINKGO_COMPILE_KERNEL=1) if(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") find_package(hip REQUIRED) diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 79ecaa9c93f..2f9adac46d6 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -56,19 +56,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
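The deleted per-file onemkl_support helpers reappear centrally behind the new dpcpp/base/onemkl_bindings.hpp include as onemkl::is_supported. A hedged sketch of the trait-gated dispatch the csr and dense kernels now share; the specializations and the branch bodies are illustrative, not the exact binding:

    #include <type_traits>

    namespace onemkl {
    // compile-time whitelist of value types the oneMKL code path accepts
    template <typename T>
    struct is_supported : std::false_type {};
    template <>
    struct is_supported<float> : std::true_type {};
    template <>
    struct is_supported<double> : std::true_type {};
    }  // namespace onemkl

    template <typename ValueType>
    void apply_sketch()
    {
        if constexpr (onemkl::is_supported<ValueType>::value) {
            // vendor gemm path
        } else {
            // generic fallback kernel
        }
    }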
// thrust calls the c function not the function from std // Maybe override the function from thrust directlry -GKO_ATTRIBUTES GKO_INLINE __half hypot(__half a, __half b) +__device__ __forceinline__ __half hypot(__half a, __half b) { return hypot(static_cast(a), static_cast(b)); } -GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> sqrt( +__device__ __forceinline__ thrust::complex<__half> sqrt( thrust::complex<__half> a) { return sqrt(static_cast>(a)); } -// __device__ __forceinline__ float sqrt(float val) { return sqrtf(val); } -// __device__ __forceinline__ double sqrt(double val) { return ::sqrt(val); } __device__ __forceinline__ thrust::complex sqrt( thrust::complex val) { @@ -118,7 +116,8 @@ THRUST_HALF_FRIEND_OPERATOR(/, /=) namespace gko { #if defined(__CUDA_ARCH__) -#if __CUDA_ARCH__ >= 700 +// from the cuda_fp16.hpp +#if __CUDA_ARCH__ >= 530 __device__ __forceinline__ bool is_nan(const __half& val) { return __hisnan(val); @@ -137,7 +136,7 @@ __device__ __forceinline__ __half abs(const __half& val) } #endif -#elif defined(__HIP_DEVICE_COMPILE__) +#else // Not nvidia device __device__ __forceinline__ bool is_nan(const __half& val) { return __hisnan(val); @@ -148,10 +147,6 @@ __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } #endif -// #if defined(__HIPCC__) - -// #endif - namespace kernels { namespace hip { diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp index fa970818622..8f79cafd427 100644 --- a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp @@ -53,18 +53,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { -// namespace detail { -// #if !defined(__HIP_DEVICE_COMPILE__) -// template <> -// struct basic_float_traits<__half> { -// using type = __half; -// static constexpr int sign_bits = 1; -// static constexpr int significand_bits = 10; -// static constexpr int exponent_bits = 5; -// static constexpr bool rounds_to_nearest = true; -// }; -// #endif -// } // namespace detail namespace kernels { namespace hip { /** diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp index 8d106b6c962..9e6f353abe4 100644 --- a/hip/solver/idr_kernels.hip.cpp +++ b/hip/solver/idr_kernels.hip.cpp @@ -102,10 +102,10 @@ void initialize_subspace_vectors(std::shared_ptr exec, auto gen = hiprand::rand_generator(std::random_device{}(), HIPRAND_RNG_PSEUDO_DEFAULT, exec->get_stream()); - // hiprand::rand_vector( - // gen, - // subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), - // 0.0, 1.0, subspace_vectors->get_values()); + hiprand::rand_vector( + gen, + subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), + 0.0, 1.0, subspace_vectors->get_values()); } } diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 75d0de9a511..a568715f506 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -519,6 +519,11 @@ struct arth_type { using type = float; }; +template +struct arth_type> { + using type = std::complex::type>; +}; + template struct infinity_impl { // CUDA doesn't allow us to call std::numeric_limits functions @@ -733,7 +738,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr int64 ceildiv(int64 num, int64 den) template GKO_INLINE __host__ constexpr T zero() { - return T(0.0); + return T{}; } @@ -761,7 +766,7 @@ GKO_INLINE __host__ constexpr T zero(const T&) template GKO_INLINE 
__host__ constexpr T one() { - return T(1.0); + return T(1); } @@ -791,7 +796,7 @@ GKO_INLINE __device__ constexpr std::enable_if_t< !std::is_same>>::value, T> zero() { - return T(0.0); + return T{}; } @@ -821,7 +826,7 @@ GKO_INLINE __device__ constexpr std::enable_if_t< !std::is_same>>::value, T> one() { - return T(1.0); + return T(1); } @@ -852,7 +857,7 @@ GKO_INLINE __device__ constexpr T one(const T&) template GKO_INLINE GKO_ATTRIBUTES constexpr T zero() { - return T(0.0); + return T{}; } @@ -1181,7 +1186,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr xstd::enable_if_t::value, T> abs(const T& x) { - return x >= zero() ? x : static_cast(-x); + return x >= zero() ? x : -x; } @@ -1362,21 +1367,12 @@ GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan( */ template GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t< - !is_complex_s::value && !std::is_same::value, T> + !is_complex_s::value, typename detail::arth_type::type> nan() { return std::numeric_limits::quiet_NaN(); } -template -GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t< - std::is_same::value, float> -nan() -{ - return std::numeric_limits::quiet_NaN(); -} - - /** * Returns a complex with both components quiet NaN. * @@ -1385,7 +1381,8 @@ nan() * @return complex{NaN, NaN}. */ template -GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t::value, T> +GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t< + is_complex_s::value, typename detail::arth_type::type> nan() { return T{nan>(), nan>()}; diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index e6968756b1b..771cdc02af2 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -77,18 +77,13 @@ make_temporary_conversion(Ptr&& matrix) using Pointee = detail::pointee; using Dense = matrix::Dense; using NextDense = matrix::Dense>; + using NextNextDense = matrix::Dense>>; using MaybeConstDense = std::conditional_t::value, const Dense, Dense>; auto result = detail::temporary_conversion< - MaybeConstDense>::template create(matrix); + MaybeConstDense>::template create(matrix); if (!result) { - result = detail::temporary_conversion>:: - template create< - matrix::Dense>>>( - matrix); - if (!result) { - GKO_NOT_SUPPORTED(matrix); - } + GKO_NOT_SUPPORTED(matrix); } return result; } @@ -386,16 +381,11 @@ make_temporary_conversion(LinOp* matrix) auto result = detail::temporary_conversion< experimental::distributed::Vector>:: template create< - experimental::distributed::Vector>>( - matrix); - if (!result) { - result = detail::temporary_conversion< - experimental::distributed::Vector>:: - template create>, + experimental::distributed::Vector< next_precision>>>(matrix); - if (!result) { - GKO_NOT_SUPPORTED(matrix); - } + if (!result) { + GKO_NOT_SUPPORTED(matrix); } return result; } @@ -411,16 +401,11 @@ make_temporary_conversion(const LinOp* matrix) auto result = detail::temporary_conversion< const experimental::distributed::Vector>:: template create< - experimental::distributed::Vector>>( - matrix); - if (!result) { - result = detail::temporary_conversion< - const experimental::distributed::Vector>:: - template create>, + experimental::distributed::Vector< next_precision>>>(matrix); - if (!result) { - GKO_NOT_SUPPORTED(matrix); - } + if (!result) { + GKO_NOT_SUPPORTED(matrix); } return result; } diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index bcdaa5d2d20..40502df13a1 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp 
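With the widened create<> call in make_temporary_conversion above (Dense, NextDense and NextNextDense tried in one attempt), the dispatch probes the native precision and both steps of the precision chain in a single lookup instead of two nested attempts. A hedged usage sketch, assuming GINKGO_ENABLE_HALF is on so the chain spans half, float and double, and exec is any valid executor:

    auto x = gko::matrix::Dense<float>::create(exec, gko::dim<2>{2, 1});
    // temporary Dense<double> view of x: converted on entry, and for a
    // non-const argument converted back when the temporary is destroyed
    auto converted = gko::make_temporary_conversion<double>(x.get());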
@@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 2edf676bda1..50925c106bc 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -75,8 +75,6 @@ target_include_directories(ginkgo_omp PRIVATE "${OpenMP_CXX_INCLUDE_DIRS}") separate_arguments(OpenMP_SEP_FLAGS NATIVE_COMMAND "${OpenMP_CXX_FLAGS}") target_compile_options(ginkgo_omp PRIVATE "${OpenMP_SEP_FLAGS}") target_compile_options(ginkgo_omp PRIVATE "${GINKGO_COMPILER_FLAGS}") -target_compile_definitions(ginkgo_omp PRIVATE GINKGO_COMPILE_KERNEL=1) - # Need to link against ginkgo_cuda for the `raw_copy_to(CudaExecutor ...)` method target_link_libraries(ginkgo_omp PRIVATE ginkgo_cuda) # Need to link against ginkgo_hip for the `raw_copy_to(HipExecutor ...)` method diff --git a/omp/solver/idr_kernels.cpp b/omp/solver/idr_kernels.cpp index 6ae31a1dc27..465912b7b1c 100644 --- a/omp/solver/idr_kernels.cpp +++ b/omp/solver/idr_kernels.cpp @@ -167,16 +167,16 @@ void initialize(std::shared_ptr exec, const size_type nrhs, // Initialize and Orthonormalize P const auto num_rows = subspace_vectors->get_size()[0]; const auto num_cols = subspace_vectors->get_size()[1]; - // auto dist = - // std::normal_distribution>(0.0, 1.0); + auto dist = std::normal_distribution< + typename detail::arth_type>::type>(0.0, 1.0); auto seed = std::random_device{}(); auto gen = std::default_random_engine(seed); for (size_type row = 0; row < num_rows; row++) { if (!deterministic) { - // for (size_type col = 0; col < num_cols; col++) { - // subspace_vectors->at(row, col) = - // get_rand_value(dist, gen); - // } + for (size_type col = 0; col < num_cols; col++) { + subspace_vectors->at(row, col) = + get_rand_value(dist, gen); + } } for (size_type i = 0; i < row; i++) { diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index b857904415e..c445d9dad03 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -70,8 +70,6 @@ target_compile_options(ginkgo_reference PRIVATE "${GINKGO_COMPILER_FLAGS}") if (CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC") set_source_files_properties(preconditioner/jacobi_kernels.cpp PROPERTIES COMPILE_FLAGS "-O1") endif() - -target_compile_definitions(ginkgo_reference PRIVATE GINKGO_COMPILE_KERNEL=1) if (GINKGO_CHECK_CIRCULAR_DEPS) ginkgo_check_headers(ginkgo_reference "") endif() diff --git a/reference/matrix/diagonal_kernels.cpp b/reference/matrix/diagonal_kernels.cpp index 03ce332192f..61ccfdd0620 100644 --- a/reference/matrix/diagonal_kernels.cpp +++ b/reference/matrix/diagonal_kernels.cpp @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#include "core/base/extended_float.hpp" namespace gko { diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 04b142a4132..adfe63e5770 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -446,7 +446,7 @@ TYPED_TEST(Matrix, CanConvertToNextPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{static_cast>(r::value)}; + : gko::remove_complex{r::value}; this->dist_mat->convert_to(tmp); tmp->convert_to(res); @@ -473,7 +473,7 @@ TYPED_TEST(Matrix, CanMoveToNextPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? 
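The restored subspace initialization in the omp idr kernel above draws the random entries in the value type's arithmetic type, because std::normal_distribution is only specified for the standard floating-point types and cannot be used with half directly. A hedged, self-contained sketch of that idea, with plain float standing in for the arth_type trait used in the patch:

    #include <random>

    template <typename ValueType>
    ValueType rand_value_sketch(std::default_random_engine& gen)
    {
        // draw in float and narrow afterwards; half is not a valid
        // result type for the distribution itself
        std::normal_distribution<float> dist(0.0f, 1.0f);
        return static_cast<ValueType>(dist(gen));
    }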
gko::remove_complex{0} - : gko::remove_complex{static_cast>(r::value)}; + : gko::remove_complex{r::value}; this->dist_mat->move_to(tmp); tmp->convert_to(res); From 814dca4b528d51232edffb5c1b8db8c6c24bebe3 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 8 Feb 2023 22:19:17 +0100 Subject: [PATCH 19/48] move half.hpp out of type.hpp --- core/base/device_matrix_data_kernels.hpp | 1 + core/base/extended_float.hpp | 1 + core/base/mixed_precision_types.hpp | 1 + core/base/mtx_io.cpp | 1 + core/base/utils.hpp | 1 + core/components/absolute_array_kernels.hpp | 1 + core/components/fill_array_kernels.hpp | 1 + core/components/format_conversion_kernels.hpp | 1 + core/components/precision_conversion_kernels.hpp | 1 + core/components/prefix_sum_kernels.hpp | 1 + core/components/reduce_array_kernels.hpp | 1 + core/distributed/matrix_kernels.hpp | 1 + core/factorization/cholesky_kernels.hpp | 1 + core/factorization/factorization_kernels.hpp | 1 + core/factorization/ilu_kernels.hpp | 1 + core/factorization/lu_kernels.hpp | 1 + core/factorization/par_ic_kernels.hpp | 1 + core/factorization/par_ict_kernels.hpp | 1 + core/factorization/par_ilu_kernels.hpp | 1 + core/factorization/par_ilut_kernels.hpp | 1 + core/matrix/coo_kernels.hpp | 1 + core/matrix/csr_kernels.hpp | 1 + core/matrix/csr_lookup.hpp | 1 + core/matrix/dense_kernels.hpp | 1 + core/matrix/diagonal_kernels.hpp | 1 + core/matrix/fbcsr_kernels.hpp | 1 + core/matrix/fft_kernels.hpp | 1 + core/matrix/row_gatherer.cpp | 1 + core/matrix/sparsity_csr_kernels.hpp | 1 + core/multigrid/pgm.cpp | 1 + core/preconditioner/jacobi_utils.hpp | 1 + core/reorder/rcm_kernels.hpp | 1 + core/solver/bicg_kernels.hpp | 1 + core/solver/bicgstab_kernels.hpp | 1 + core/solver/cb_gmres.cpp | 1 + core/solver/cb_gmres_accessor.hpp | 1 + core/solver/cb_gmres_kernels.hpp | 1 + core/solver/cg_kernels.hpp | 1 + core/solver/cgs_kernels.hpp | 1 + core/solver/common_gmres_kernels.hpp | 1 + core/solver/gmres_kernels.hpp | 1 + core/solver/idr_kernels.hpp | 1 + core/solver/ir_kernels.hpp | 1 + core/solver/multigrid.cpp | 1 + core/solver/multigrid_kernels.hpp | 1 + core/stop/criterion_kernels.hpp | 1 + core/stop/residual_norm_kernels.hpp | 1 + core/test/accessor/reduced_row_major_ginkgo.cpp | 3 +++ core/test/utils.hpp | 1 + core/test/utils/assertions.hpp | 1 + cuda/base/types.hpp | 1 + hip/base/types.hip.hpp | 1 + include/ginkgo/core/base/array.hpp | 1 + include/ginkgo/core/base/dim.hpp | 1 + include/ginkgo/core/base/exception.hpp | 1 + include/ginkgo/core/base/executor.hpp | 1 + include/ginkgo/core/base/half.hpp | 1 + include/ginkgo/core/base/index_set.hpp | 1 + include/ginkgo/core/base/intrinsics.hpp | 1 + include/ginkgo/core/base/lin_op.hpp | 1 + include/ginkgo/core/base/math.hpp | 7 +++++++ include/ginkgo/core/base/matrix_assembly_data.hpp | 1 + include/ginkgo/core/base/matrix_data.hpp | 1 + include/ginkgo/core/base/mpi.hpp | 1 + include/ginkgo/core/base/range.hpp | 1 + include/ginkgo/core/base/range_accessors.hpp | 1 + include/ginkgo/core/base/types.hpp | 2 +- include/ginkgo/core/base/utils_helper.hpp | 1 + include/ginkgo/core/base/version.hpp | 1 + include/ginkgo/core/distributed/partition.hpp | 1 + include/ginkgo/core/factorization/factorization.hpp | 1 + include/ginkgo/core/factorization/ic.hpp | 1 + include/ginkgo/core/factorization/ilu.hpp | 1 + include/ginkgo/core/factorization/par_ic.hpp | 1 + include/ginkgo/core/factorization/par_ict.hpp | 1 + include/ginkgo/core/factorization/par_ilu.hpp | 1 + include/ginkgo/core/factorization/par_ilut.hpp | 1 + 
include/ginkgo/core/log/logger.hpp | 1 + include/ginkgo/core/matrix/dense.hpp | 1 + include/ginkgo/core/matrix/permutation.hpp | 1 + include/ginkgo/core/matrix/row_gatherer.hpp | 1 + include/ginkgo/core/multigrid/fixed_coarsening.hpp | 1 + include/ginkgo/core/multigrid/pgm.hpp | 1 + include/ginkgo/core/reorder/rcm.hpp | 1 + include/ginkgo/core/reorder/scaled_reordered.hpp | 1 + include/ginkgo/core/solver/bicg.hpp | 1 + include/ginkgo/core/solver/bicgstab.hpp | 1 + include/ginkgo/core/solver/cb_gmres.hpp | 1 + include/ginkgo/core/solver/cg.hpp | 1 + include/ginkgo/core/solver/cgs.hpp | 1 + include/ginkgo/core/solver/fcg.hpp | 1 + include/ginkgo/core/solver/gmres.hpp | 1 + include/ginkgo/core/solver/idr.hpp | 1 + include/ginkgo/core/solver/ir.hpp | 1 + include/ginkgo/core/solver/multigrid.hpp | 1 + include/ginkgo/core/solver/triangular.hpp | 1 + include/ginkgo/core/stop/stopping_status.hpp | 1 + omp/components/atomic.hpp | 1 + 98 files changed, 106 insertions(+), 1 deletion(-) diff --git a/core/base/device_matrix_data_kernels.hpp b/core/base/device_matrix_data_kernels.hpp index 2ab06dec3ec..e942362934d 100644 --- a/core/base/device_matrix_data_kernels.hpp +++ b/core/base/device_matrix_data_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index 2ed7c8b2626..fb01cfe7fe0 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index a8ba4a54e30..91aa9e4eefa 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index c89da00f365..660b3bea313 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/base/utils.hpp b/core/base/utils.hpp index 6c5bfb783dd..4250d35e8ef 100644 --- a/core/base/utils.hpp +++ b/core/base/utils.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/components/absolute_array_kernels.hpp b/core/components/absolute_array_kernels.hpp index 94ec12e98a5..affa5f27eb0 100644 --- a/core/components/absolute_array_kernels.hpp +++ b/core/components/absolute_array_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/components/fill_array_kernels.hpp b/core/components/fill_array_kernels.hpp index 607e99d036e..3da114961c7 100644 --- a/core/components/fill_array_kernels.hpp +++ b/core/components/fill_array_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/core/components/format_conversion_kernels.hpp b/core/components/format_conversion_kernels.hpp index 76d5ad6000b..e46814b6351 100644 --- a/core/components/format_conversion_kernels.hpp +++ b/core/components/format_conversion_kernels.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include diff --git a/core/components/precision_conversion_kernels.hpp b/core/components/precision_conversion_kernels.hpp index 13da41d72d8..1abf78d1c19 100644 --- a/core/components/precision_conversion_kernels.hpp +++ b/core/components/precision_conversion_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/components/prefix_sum_kernels.hpp b/core/components/prefix_sum_kernels.hpp index 09a34f5931b..277c13ff7ba 100644 --- a/core/components/prefix_sum_kernels.hpp +++ b/core/components/prefix_sum_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/core/components/reduce_array_kernels.hpp b/core/components/reduce_array_kernels.hpp index 5ff591e71df..845a77f5409 100644 --- a/core/components/reduce_array_kernels.hpp +++ b/core/components/reduce_array_kernels.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp index bda7c30b88b..8a341cad9b0 100644 --- a/core/distributed/matrix_kernels.hpp +++ b/core/distributed/matrix_kernels.hpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp index 009bed918a3..4ac2cfc23a3 100644 --- a/core/factorization/cholesky_kernels.hpp +++ b/core/factorization/cholesky_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/factorization/factorization_kernels.hpp b/core/factorization/factorization_kernels.hpp index 85d5fc5a3ae..9a240ad455b 100644 --- a/core/factorization/factorization_kernels.hpp +++ b/core/factorization/factorization_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/factorization/ilu_kernels.hpp b/core/factorization/ilu_kernels.hpp index 12209b0d1c5..d9337c5c5a0 100644 --- a/core/factorization/ilu_kernels.hpp +++ b/core/factorization/ilu_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/factorization/lu_kernels.hpp b/core/factorization/lu_kernels.hpp index d3e7aea8f08..1e41b9e9b85 100644 --- a/core/factorization/lu_kernels.hpp +++ b/core/factorization/lu_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/factorization/par_ic_kernels.hpp b/core/factorization/par_ic_kernels.hpp index 8a461501fc7..f5356fd334b 100644 --- a/core/factorization/par_ic_kernels.hpp +++ b/core/factorization/par_ic_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/factorization/par_ict_kernels.hpp b/core/factorization/par_ict_kernels.hpp index c6049a220f3..198d10db87a 100644 --- a/core/factorization/par_ict_kernels.hpp +++ b/core/factorization/par_ict_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include #include diff --git a/core/factorization/par_ilu_kernels.hpp b/core/factorization/par_ilu_kernels.hpp index 8a8bd96314f..51f43ea1e74 100644 --- a/core/factorization/par_ilu_kernels.hpp +++ b/core/factorization/par_ilu_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/factorization/par_ilut_kernels.hpp b/core/factorization/par_ilut_kernels.hpp index 98d908e5c83..b4c4747159d 100644 --- a/core/factorization/par_ilut_kernels.hpp +++ b/core/factorization/par_ilut_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/matrix/coo_kernels.hpp b/core/matrix/coo_kernels.hpp index 84db65e27fc..2527a6e675f 100644 --- a/core/matrix/coo_kernels.hpp +++ b/core/matrix/coo_kernels.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index 42a92ca1b84..2b7f9e2befd 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/matrix/csr_lookup.hpp b/core/matrix/csr_lookup.hpp index 733ef9214ba..a1cded747ce 100644 --- a/core/matrix/csr_lookup.hpp +++ b/core/matrix/csr_lookup.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp index 9a487fadeda..4cf8a1cac2a 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/matrix/diagonal_kernels.hpp b/core/matrix/diagonal_kernels.hpp index 9d3e7901dc5..88a4b790458 100644 --- a/core/matrix/diagonal_kernels.hpp +++ b/core/matrix/diagonal_kernels.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/matrix/fbcsr_kernels.hpp b/core/matrix/fbcsr_kernels.hpp index c180527a216..6a8bfe259e9 100644 --- a/core/matrix/fbcsr_kernels.hpp +++ b/core/matrix/fbcsr_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/matrix/fft_kernels.hpp b/core/matrix/fft_kernels.hpp index 09e16dc8a1a..7de42cedc13 100644 --- a/core/matrix/fft_kernels.hpp +++ b/core/matrix/fft_kernels.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp index f2ec59da2e5..b3807f2514e 100644 --- a/core/matrix/row_gatherer.cpp +++ b/core/matrix/row_gatherer.cpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/core/matrix/sparsity_csr_kernels.hpp b/core/matrix/sparsity_csr_kernels.hpp index 8f80e738b91..d5cebdb7007 100644 --- a/core/matrix/sparsity_csr_kernels.hpp +++ b/core/matrix/sparsity_csr_kernels.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index 9df1b5d8870..a37a3f9050b 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp index f929fcc5eba..bcf463775a9 100644 --- a/core/preconditioner/jacobi_utils.hpp +++ b/core/preconditioner/jacobi_utils.hpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_PRECONDITIONER_JACOBI_UTILS_HPP_ +#include #include #include diff --git a/core/reorder/rcm_kernels.hpp b/core/reorder/rcm_kernels.hpp index 4fde334a26b..5f4b329c554 100644 --- a/core/reorder/rcm_kernels.hpp +++ b/core/reorder/rcm_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/bicg_kernels.hpp b/core/solver/bicg_kernels.hpp index 6f22feb9446..6f1244de6fa 100644 --- a/core/solver/bicg_kernels.hpp +++ b/core/solver/bicg_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/bicgstab_kernels.hpp b/core/solver/bicgstab_kernels.hpp index bdd2a18db48..81cb41fa605 100644 --- a/core/solver/bicgstab_kernels.hpp +++ b/core/solver/bicgstab_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/cb_gmres.cpp b/core/solver/cb_gmres.cpp index 282295f261b..353e3703d2d 100644 --- a/core/solver/cb_gmres.cpp +++ b/core/solver/cb_gmres.cpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/core/solver/cb_gmres_accessor.hpp b/core/solver/cb_gmres_accessor.hpp index 125dc5e901c..0743b706453 100644 --- a/core/solver/cb_gmres_accessor.hpp +++ b/core/solver/cb_gmres_accessor.hpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/core/solver/cb_gmres_kernels.hpp b/core/solver/cb_gmres_kernels.hpp index a0040bc24cf..f584a364e5e 100644 --- a/core/solver/cb_gmres_kernels.hpp +++ b/core/solver/cb_gmres_kernels.hpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/core/solver/cg_kernels.hpp b/core/solver/cg_kernels.hpp index d1eb99ef6ec..81b83007667 100644 --- a/core/solver/cg_kernels.hpp +++ b/core/solver/cg_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/cgs_kernels.hpp b/core/solver/cgs_kernels.hpp index a618e8d7a9a..9d44540f347 100644 --- a/core/solver/cgs_kernels.hpp +++ b/core/solver/cgs_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include #include diff --git a/core/solver/common_gmres_kernels.hpp b/core/solver/common_gmres_kernels.hpp index bde667b79d8..a1288301145 100644 --- a/core/solver/common_gmres_kernels.hpp +++ b/core/solver/common_gmres_kernels.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/gmres_kernels.hpp b/core/solver/gmres_kernels.hpp index bd236f8a158..8d0ef899fee 100644 --- a/core/solver/gmres_kernels.hpp +++ b/core/solver/gmres_kernels.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/idr_kernels.hpp b/core/solver/idr_kernels.hpp index 1db367622b6..7fbf73f325e 100644 --- a/core/solver/idr_kernels.hpp +++ b/core/solver/idr_kernels.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/ir_kernels.hpp b/core/solver/ir_kernels.hpp index ef4633d61f0..b29d624dac6 100644 --- a/core/solver/ir_kernels.hpp +++ b/core/solver/ir_kernels.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 57c04b8e95e..7e41c02780e 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/core/solver/multigrid_kernels.hpp b/core/solver/multigrid_kernels.hpp index 4869fd0ddad..fa246283b36 100644 --- a/core/solver/multigrid_kernels.hpp +++ b/core/solver/multigrid_kernels.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/stop/criterion_kernels.hpp b/core/stop/criterion_kernels.hpp index 8d4fb395841..7a9d537fe8a 100644 --- a/core/stop/criterion_kernels.hpp +++ b/core/stop/criterion_kernels.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/stop/residual_norm_kernels.hpp b/core/stop/residual_norm_kernels.hpp index f9c2ce89f93..c17f9dabfd8 100644 --- a/core/stop/residual_norm_kernels.hpp +++ b/core/stop/residual_norm_kernels.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/test/accessor/reduced_row_major_ginkgo.cpp b/core/test/accessor/reduced_row_major_ginkgo.cpp index b12fba6ad0f..d6649e5f4c7 100644 --- a/core/test/accessor/reduced_row_major_ginkgo.cpp +++ b/core/test/accessor/reduced_row_major_ginkgo.cpp @@ -40,6 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include // necessary for gko::half + + #include "accessor/index_span.hpp" #include "accessor/range.hpp" #include "accessor/reduced_row_major.hpp" diff --git a/core/test/utils.hpp b/core/test/utils.hpp index 874daafb137..fd1056df2e8 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include #include diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index a8767ba5526..7bf5db1b5e9 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 27b0f95e9da..bb1f1a34463 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -47,6 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 2f9adac46d6..32a71fe57fc 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/include/ginkgo/core/base/array.hpp b/include/ginkgo/core/base/array.hpp index 1140f1e400c..b52f13d08d8 100644 --- a/include/ginkgo/core/base/array.hpp +++ b/include/ginkgo/core/base/array.hpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include diff --git a/include/ginkgo/core/base/dim.hpp b/include/ginkgo/core/base/dim.hpp index c70c5f054ec..b42bad41d54 100644 --- a/include/ginkgo/core/base/dim.hpp +++ b/include/ginkgo/core/base/dim.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/include/ginkgo/core/base/exception.hpp b/include/ginkgo/core/base/exception.hpp index 8b270ed7a98..e3885135b11 100644 --- a/include/ginkgo/core/base/exception.hpp +++ b/include/ginkgo/core/base/exception.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index c7195501178..e18984fb866 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 09b3c7a0686..1ae96bd942d 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/include/ginkgo/core/base/index_set.hpp b/include/ginkgo/core/base/index_set.hpp index 281690b7807..647671378f7 100644 --- a/include/ginkgo/core/base/index_set.hpp +++ b/include/ginkgo/core/base/index_set.hpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include diff --git a/include/ginkgo/core/base/intrinsics.hpp b/include/ginkgo/core/base/intrinsics.hpp index 2366c824316..b9264c39030 100644 --- a/include/ginkgo/core/base/intrinsics.hpp +++ b/include/ginkgo/core/base/intrinsics.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp index c06c43bbb6e..bdf78767e95 100644 --- a/include/ginkgo/core/base/lin_op.hpp +++ b/include/ginkgo/core/base/lin_op.hpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index a568715f506..55f4761dee2 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include @@ -56,9 +57,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. class __half; + namespace thrust { + + template class complex; + + } namespace std { @@ -70,6 +76,7 @@ inline gko::half abs(std::complex a) return gko::half(sqrt(float(a.real() * a.real() + a.imag() * a.imag()))); } + inline gko::half sqrt(gko::half a) { return gko::half(sqrt(float(a))); } inline std::complex sqrt(std::complex a) diff --git a/include/ginkgo/core/base/matrix_assembly_data.hpp b/include/ginkgo/core/base/matrix_assembly_data.hpp index 3ea112094f2..5aa4419aa2d 100644 --- a/include/ginkgo/core/base/matrix_assembly_data.hpp +++ b/include/ginkgo/core/base/matrix_assembly_data.hpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/base/matrix_data.hpp b/include/ginkgo/core/base/matrix_data.hpp index f823dfc6b76..fbe60c7f07e 100644 --- a/include/ginkgo/core/base/matrix_data.hpp +++ b/include/ginkgo/core/base/matrix_data.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 42653015725..54157663879 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp index 1e4c7a5d00e..cdd9af420ee 100644 --- a/include/ginkgo/core/base/range.hpp +++ b/include/ginkgo/core/base/range.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/base/range_accessors.hpp b/include/ginkgo/core/base/range_accessors.hpp index 20934afcdf4..546d1891cd7 100644 --- a/include/ginkgo/core/base/range_accessors.hpp +++ b/include/ginkgo/core/base/range_accessors.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 3ad8d6684d4..cd4933ab205 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -862,5 +862,5 @@ using comm_index_type = int; } // namespace experimental } // namespace gko -#include + #endif // GKO_PUBLIC_CORE_BASE_TYPES_HPP_ diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp index 3f26d5d7659..95890f4bd6d 100644 --- a/include/ginkgo/core/base/utils_helper.hpp +++ b/include/ginkgo/core/base/utils_helper.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/base/version.hpp b/include/ginkgo/core/base/version.hpp index caa0cbe0761..299d8fe93f1 100644 --- a/include/ginkgo/core/base/version.hpp +++ b/include/ginkgo/core/base/version.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/include/ginkgo/core/distributed/partition.hpp b/include/ginkgo/core/distributed/partition.hpp index bb36528a4a8..f3eebf68283 100644 --- a/include/ginkgo/core/distributed/partition.hpp +++ b/include/ginkgo/core/distributed/partition.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/include/ginkgo/core/factorization/factorization.hpp b/include/ginkgo/core/factorization/factorization.hpp index 65b551c35f2..c16a65c4704 100644 --- a/include/ginkgo/core/factorization/factorization.hpp +++ b/include/ginkgo/core/factorization/factorization.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/ic.hpp b/include/ginkgo/core/factorization/ic.hpp index d3f0ac27926..a48e076d852 100644 --- a/include/ginkgo/core/factorization/ic.hpp +++ b/include/ginkgo/core/factorization/ic.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp index 98d36ee9d87..a6341f09ba8 100644 --- a/include/ginkgo/core/factorization/ilu.hpp +++ b/include/ginkgo/core/factorization/ilu.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ic.hpp b/include/ginkgo/core/factorization/par_ic.hpp index 2df350f31a2..fd5abbb726b 100644 --- a/include/ginkgo/core/factorization/par_ic.hpp +++ b/include/ginkgo/core/factorization/par_ic.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ict.hpp b/include/ginkgo/core/factorization/par_ict.hpp index 173136fa682..573666969ad 100644 --- a/include/ginkgo/core/factorization/par_ict.hpp +++ b/include/ginkgo/core/factorization/par_ict.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ilu.hpp b/include/ginkgo/core/factorization/par_ilu.hpp index 878721afbd5..954b8a484cb 100644 --- a/include/ginkgo/core/factorization/par_ilu.hpp +++ b/include/ginkgo/core/factorization/par_ilu.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ilut.hpp b/include/ginkgo/core/factorization/par_ilut.hpp index 76f3789a44e..cc8b17c281c 100644 --- a/include/ginkgo/core/factorization/par_ilut.hpp +++ b/include/ginkgo/core/factorization/par_ilut.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp index b700e1e703a..4c8a9981e10 100644 --- a/include/ginkgo/core/log/logger.hpp +++ b/include/ginkgo/core/log/logger.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 0daa5987188..5076ec923bb 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/matrix/permutation.hpp b/include/ginkgo/core/matrix/permutation.hpp index 163160a2af6..0ccc24004ee 100644 --- a/include/ginkgo/core/matrix/permutation.hpp +++ b/include/ginkgo/core/matrix/permutation.hpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/matrix/row_gatherer.hpp b/include/ginkgo/core/matrix/row_gatherer.hpp index 3baedce4806..4317dfee51b 100644 --- a/include/ginkgo/core/matrix/row_gatherer.hpp +++ b/include/ginkgo/core/matrix/row_gatherer.hpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/multigrid/fixed_coarsening.hpp b/include/ginkgo/core/multigrid/fixed_coarsening.hpp index 3c5c3998536..c168e973a24 100644 --- a/include/ginkgo/core/multigrid/fixed_coarsening.hpp +++ b/include/ginkgo/core/multigrid/fixed_coarsening.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/multigrid/pgm.hpp b/include/ginkgo/core/multigrid/pgm.hpp index a90507ce740..16d77aa2e11 100644 --- a/include/ginkgo/core/multigrid/pgm.hpp +++ b/include/ginkgo/core/multigrid/pgm.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/reorder/rcm.hpp b/include/ginkgo/core/reorder/rcm.hpp index 72ba6827f2b..5ffe80b8524 100644 --- a/include/ginkgo/core/reorder/rcm.hpp +++ b/include/ginkgo/core/reorder/rcm.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/reorder/scaled_reordered.hpp b/include/ginkgo/core/reorder/scaled_reordered.hpp index 3c4f6efbbd7..9269106eb07 100644 --- a/include/ginkgo/core/reorder/scaled_reordered.hpp +++ b/include/ginkgo/core/reorder/scaled_reordered.hpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/bicg.hpp b/include/ginkgo/core/solver/bicg.hpp index c7b47a0e807..3bb1a69e350 100644 --- a/include/ginkgo/core/solver/bicg.hpp +++ b/include/ginkgo/core/solver/bicg.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/bicgstab.hpp b/include/ginkgo/core/solver/bicgstab.hpp index 214e669b2ff..eef2e454698 100644 --- a/include/ginkgo/core/solver/bicgstab.hpp +++ b/include/ginkgo/core/solver/bicgstab.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/cb_gmres.hpp b/include/ginkgo/core/solver/cb_gmres.hpp index a2dbb1efce1..9cf6c3913ae 100644 --- a/include/ginkgo/core/solver/cb_gmres.hpp +++ b/include/ginkgo/core/solver/cb_gmres.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/cg.hpp b/include/ginkgo/core/solver/cg.hpp index bc0861cf270..b57abe73467 100644 --- a/include/ginkgo/core/solver/cg.hpp +++ b/include/ginkgo/core/solver/cg.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/cgs.hpp b/include/ginkgo/core/solver/cgs.hpp index 22f81d8a292..57f9c8a9735 100644 --- a/include/ginkgo/core/solver/cgs.hpp +++ b/include/ginkgo/core/solver/cgs.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/fcg.hpp b/include/ginkgo/core/solver/fcg.hpp index cad7a29fc27..e13529eb38c 100644 --- a/include/ginkgo/core/solver/fcg.hpp +++ b/include/ginkgo/core/solver/fcg.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp index d7d0f57a8a4..95dbdba0d1d 100644 --- a/include/ginkgo/core/solver/gmres.hpp +++ b/include/ginkgo/core/solver/gmres.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/idr.hpp b/include/ginkgo/core/solver/idr.hpp index fc677f33171..5ab0cb17c3f 100644 --- a/include/ginkgo/core/solver/idr.hpp +++ b/include/ginkgo/core/solver/idr.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp index c5c69c1fb67..85ff3e970a6 100644 --- a/include/ginkgo/core/solver/ir.hpp +++ b/include/ginkgo/core/solver/ir.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/multigrid.hpp b/include/ginkgo/core/solver/multigrid.hpp index 2d04a889445..cd4a3ed6f9a 100644 --- a/include/ginkgo/core/solver/multigrid.hpp +++ b/include/ginkgo/core/solver/multigrid.hpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/triangular.hpp b/include/ginkgo/core/solver/triangular.hpp index a05c8d62b84..96a8c58b040 100644 --- a/include/ginkgo/core/solver/triangular.hpp +++ b/include/ginkgo/core/solver/triangular.hpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/stop/stopping_status.hpp b/include/ginkgo/core/stop/stopping_status.hpp index ee7d7890cf4..addc06b3fbb 100644 --- a/include/ginkgo/core/stop/stopping_status.hpp +++ b/include/ginkgo/core/stop/stopping_status.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp index 3832d0d85ec..f2a0d9d5d86 100644 --- a/omp/components/atomic.hpp +++ b/omp/components/atomic.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include From 54755b4f06c603e08f9ac499c444e47055aca252 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 8 Feb 2023 23:46:01 +0100 Subject: [PATCH 20/48] enable half for testing --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 350a5f296ff..2d8d65e354c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,7 +29,7 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF) option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF) option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF) option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF) -option(GINKGO_ENABLE_HALF "Enable the half operation" OFF) +option(GINKGO_ENABLE_HALF "Enable the half operation" ON) option(GINKGO_SKIP_DEPENDENCY_UPDATE "Do not update dependencies each time the project is rebuilt" ON) option(GINKGO_EXPORT_BUILD_DIR From cc3a7d5cd8ab58d9b45297f97fcece8fcde64299 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Thu, 9 Feb 2023 00:15:04 +0100 Subject: [PATCH 21/48] __habs is added in cuda10.2 create_empty for its own type --- benchmark/CMakeLists.txt | 13 +++++-------- cuda/base/types.hpp | 8 +++++++- hip/base/types.hip.hpp | 13 ++++++++++--- include/ginkgo/core/distributed/vector.hpp | 9 +++++++++ 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index fd04620f595..50c24955b47 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -24,15 +24,12 @@ function(ginkgo_benchmark_cusparse_linops type def) endfunction() function(ginkgo_benchmark_hipsparse_linops type def) - add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp) + set_source_files_properties(utils/hip_linops.hip.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE) + hip_add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp + HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} -D${def} + CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS} + NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS}) target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def}) - EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS) - set_target_properties(hipsparse_linops_${type} PROPERTIES COMPILE_FLAGS ${HIP_CXX_FLAGS}) - # use Thrust C++ device just for compilation, we don't use thrust::complex in the benchmarks - target_compile_definitions(hipsparse_linops_${type} PUBLIC -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP) - target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE - ${HSA_HEADER} ${HIP_INCLUDE_DIRS} - ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS}) target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES}) endfunction() diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index bb1f1a34463..dd4e2b0929a 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -132,8 +132,14 @@ namespace cuda { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - +#if CUDA_VERSION >= 10020 __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } +#else +__device__ __forceinline__ __half abs(const __half& val) +{ + return abs(static_cast(val)); +} +#endif __device__ __forceinline__ __half sqrt(const __half& val) { return hsqrt(val); } diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 32a71fe57fc..56fe09f3017 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -79,7 +79,7 @@ __device__ __forceinline__ thrust::complex sqrt( return thrust::sqrt(val); } -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +#if GINKGO_HIP_PLATFORM_NVCC && defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 __device__ __forceinline__ __half sqrt(__half val) { return sqrt(static_cast(val)); @@ -116,16 +116,23 @@ THRUST_HALF_FRIEND_OPERATOR(/, /=) namespace gko { -#if defined(__CUDA_ARCH__) +#if GINKGO_HIP_PLATFORM_NVCC // from the cuda_fp16.hpp -#if __CUDA_ARCH__ >= 530 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 __device__ __forceinline__ bool is_nan(const __half& val) { return __hisnan(val); } +#if CUDA_VERSION >= 10020 __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } #else +__device__ __forceinline__ __half abs(const __half& val) +{ + return abs(static_cast(val)); +} +#endif +#else __device__ __forceinline__ bool is_nan(const __half& val) { return is_nan(static_cast(val)); diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index d83467a4078..7538f057353 100644 --- 
a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -661,6 +661,15 @@ struct conversion_target_helper> { source->get_communicator()); } + // Allow to create_empty of the same type + // For distributed case, next> will be V in the candicated list. + // TODO: decide to whether to add this or add condition to the list + static std::unique_ptr create_empty(const target_type* source) + { + return target_type::create(source->get_executor(), + source->get_communicator()); + } + #if GINKGO_ENABLE_HALF using snd_source_type = experimental::distributed::Vector< previous_precision>>; From 7af58c9328a9b31f8ec674521a1d4738a06c0bbd Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 9 Feb 2023 22:38:16 +0100 Subject: [PATCH 22/48] fix nullptr and missing instantiation. sycl::half has different rule in conv and full operator after 5.7 --- core/test/base/extended_float.cpp | 30 ++++++++++++++++++++++++++++++ cuda/solver/common_trs_kernels.cuh | 10 +++++----- include/ginkgo/core/base/half.hpp | 11 +++++++++-- include/ginkgo/core/base/types.hpp | 29 +++++++++++++++++++++++++++-- 4 files changed, 71 insertions(+), 9 deletions(-) diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp index bab3ac9926f..6901ae72152 100644 --- a/core/test/base/extended_float.cpp +++ b/core/test/base/extended_float.cpp @@ -140,7 +140,13 @@ TEST_F(FloatToHalf, ConvertsNan) { half x = create_from_bits("0" "11111111" "00000000000000000000001"); + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // Sycl put the 1000000000, but ours put mask + ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1000000000")); + #else ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1111111111")); + #endif } @@ -148,7 +154,13 @@ TEST_F(FloatToHalf, ConvertsNegNan) { half x = create_from_bits("1" "11111111" "00010000000000000000000"); + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // Sycl put the 1000000000, but ours put mask + ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1000000000")); + #else ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1111111111")); + #endif } @@ -196,7 +208,13 @@ TEST_F(FloatToHalf, TruncatesLargeNumber) { half x = create_from_bits("1" "10001110" "10010011111000010000100"); + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // TODO: sycl::half seems to did rounding, but ours just truncates + ASSERT_EQ(get_bits(x), get_bits("1" "11110" "1001010000")); + #else ASSERT_EQ(get_bits(x), get_bits("1" "11110" "1001001111")); + #endif } @@ -246,7 +264,13 @@ TEST_F(HalfToFloat, ConvertsNan) { float x = create_from_bits("0" "11111" "0001001000"); + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // sycl keeps significand + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "00010010000000000000000")); + #else ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "11111111111111111111111")); + #endif } @@ -254,7 +278,13 @@ TEST_F(HalfToFloat, ConvertsNegNan) { float x = create_from_bits("1" "11111" "0000000001"); + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // sycl keeps significand + ASSERT_EQ(get_bits(x), get_bits("1" 
"11111111" "00000000010000000000000")); + #else ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "11111111111111111111111")); + #endif } diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 26d17f466e8..b8595eee9b5 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -241,14 +241,14 @@ struct CudaSolveStruct : gko::solver::SolveStruct { size_type work_size{}; // TODO: In nullptr is considered nullptr_t not casted to const - // ValueType* it works as expected now + // it does not work in cuda110/100 images cusparse::buffer_size_ext( handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, - &work_size); + matrix->get_const_col_idxs(), (const ValueType*)(nullptr), num_rhs, + solve_info, policy, &work_size); // allocate workspace work.resize_and_reset(work_size); @@ -258,8 +258,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct { CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, - work.get_data()); + matrix->get_const_col_idxs(), (const ValueType*)(nullptr), num_rhs, + solve_info, policy, work.get_data()); } void solve(const matrix::Csr* matrix, diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 1ae96bd942d..1a8c1e1dfd1 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -322,7 +322,12 @@ struct precision_converter { } // namespace detail -#ifdef SYCL_LANGUAGE_VERSION +// sycl::half miss the arithmetic operator to result float not half before 5.7 +// (2022-06). It leads ? 
half : half/half ambiguous The same issue is reported +// in https://github.com/intel/llvm/issues/6028 +#if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || \ + (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) using half = sycl::half; #else /** @@ -629,7 +634,9 @@ class complex { value_type imag_; }; -#ifndef SYCL_LANGUAGE_VERSION +#if !(defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || \ + (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7))) template <> struct numeric_limits { static constexpr bool is_specialized{true}; diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index cd4933ab205..1765795f256 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -162,7 +162,9 @@ using uint64 = std::uint64_t; */ using uintptr = std::uintptr_t; -#ifdef SYCL_LANGUAGE_VERSION +#if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || \ + (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) using half = sycl::half; #else class half; @@ -428,7 +430,8 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, _enable_macro(CudaExecutor, cuda) -#if GINKGO_ENABLE_HALF +// cuda half operation is supported from arch 5.3 +#if GINKGO_ENABLE_HALF && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530) #define GKO_ADAPT_HF(_macro) template _macro #else #define GKO_ADAPT_HF(_macro) \ @@ -475,6 +478,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ + GKO_ADAPT_HF(_macro(std::complex)); \ template _macro(std::complex); \ template <> \ _macro(std::complex) GKO_NOT_IMPLEMENTED @@ -508,9 +512,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(float, float); \ template <> \ _macro(double, double) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ template _macro(std::complex, std::complex); \ template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, half)); \ template _macro(std::complex, float); \ template <> \ _macro(std::complex, double) GKO_NOT_IMPLEMENTED; @@ -582,9 +588,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ + GKO_ADAPT_HF(_macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template <> \ _macro(std::complex, int32) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, int64)); \ template _macro(std::complex, int64); \ template <> \ _macro(std::complex, int64) GKO_NOT_IMPLEMENTED @@ -651,6 +659,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64, int64)); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -683,6 +694,18 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template <> \ _macro(double, float) GKO_NOT_IMPLEMENTED; \ 
template <> \ + _macro(half, double) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(double, half) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(float, half)); \ + GKO_ADAPT_HF(_macro(half, float)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template <> \ + _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template <> \ + _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED @@ -690,9 +713,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ + GKO_ADAPT_HF(_macro(half, half)); \ template _macro(float, float); \ template <> \ _macro(double, double) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ template _macro(std::complex, std::complex); \ template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED From 35a47fa46ed717036591be7448dbdd2132d1ae7f Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 23 Mar 2023 15:29:00 +0100 Subject: [PATCH 23/48] fix missing device_type and ptr_param --- common/cuda_hip/solver/multigrid_kernels.hpp.inc | 4 ++-- include/ginkgo/core/distributed/matrix.hpp | 5 +++++ include/ginkgo/core/distributed/vector.hpp | 4 ++++ include/ginkgo/core/matrix/coo.hpp | 4 ++++ include/ginkgo/core/matrix/csr.hpp | 4 ++++ include/ginkgo/core/matrix/dense.hpp | 4 ++++ include/ginkgo/core/matrix/diagonal.hpp | 4 ++++ include/ginkgo/core/matrix/ell.hpp | 4 ++++ include/ginkgo/core/matrix/fbcsr.hpp | 4 ++++ include/ginkgo/core/matrix/hybrid.hpp | 4 ++++ include/ginkgo/core/matrix/sellp.hpp | 4 ++++ 11 files changed, 43 insertions(+), 2 deletions(-) diff --git a/common/cuda_hip/solver/multigrid_kernels.hpp.inc b/common/cuda_hip/solver/multigrid_kernels.hpp.inc index 472187314a6..12ef1d6efb6 100644 --- a/common/cuda_hip/solver/multigrid_kernels.hpp.inc +++ b/common/cuda_hip/solver/multigrid_kernels.hpp.inc @@ -191,8 +191,8 @@ void kcycle_check_stop(std::shared_ptr exec, kernel::kcycle_check_stop_kernel<<get_stream()>>>( nrhs, as_device_type(old_norm->get_const_values()), - as_device_type(new_norm->get_const_values()), rel_tol, - as_device_type(dis_stop.get_data())); + as_device_type(new_norm->get_const_values()), + as_device_type(rel_tol), as_device_type(dis_stop.get_data())); } is_stop = exec->copy_val_to_host(dis_stop.get_const_data()); } diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index c5de9e2dfec..ba81c959660 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -302,6 +302,11 @@ class Matrix #if GINKGO_ENABLE_HALF friend class Matrix>, LocalIndexType, GlobalIndexType>; + using ConvertibleTo< + Matrix>, local_index_type, + global_index_type>>::convert_to; + using ConvertibleTo>, + local_index_type, global_index_type>>::move_to; void convert_to( Matrix>, local_index_type, diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index 7538f057353..b36bcd6444e 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -198,6 +198,10 @@ class Vector #if GINKGO_ENABLE_HALF friend class Vector>>; + using ConvertibleTo< + Vector>>>::convert_to; + using ConvertibleTo< + 
Vector>>>::move_to; void convert_to(Vector>>* result) const override; diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index 22d81039546..b3435a22648 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -124,6 +124,10 @@ class Coo : public EnableLinOp>, #if GINKGO_ENABLE_HALF friend class Coo>, IndexType>; + using ConvertibleTo< + Coo>, IndexType>>::convert_to; + using ConvertibleTo< + Coo>, IndexType>>::move_to; void convert_to(Coo>, IndexType>* result) const override; diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index 8015940ced0..d95b438b09a 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -730,6 +730,10 @@ class Csr : public EnableLinOp>, #if GINKGO_ENABLE_HALF friend class Csr>, IndexType>; + using ConvertibleTo< + Csr>, IndexType>>::convert_to; + using ConvertibleTo< + Csr>, IndexType>>::move_to; void convert_to(Csr>, IndexType>* result) const override; diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 5076ec923bb..317f5cc5668 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -308,6 +308,10 @@ class Dense #if GINKGO_ENABLE_HALF friend class Dense>>; + using ConvertibleTo< + Dense>>>::convert_to; + using ConvertibleTo< + Dense>>>::move_to; void convert_to(Dense>>* result) const override; diff --git a/include/ginkgo/core/matrix/diagonal.hpp b/include/ginkgo/core/matrix/diagonal.hpp index 3202ea1ef07..d7ff95aa9f1 100644 --- a/include/ginkgo/core/matrix/diagonal.hpp +++ b/include/ginkgo/core/matrix/diagonal.hpp @@ -116,6 +116,10 @@ class Diagonal #if GINKGO_ENABLE_HALF friend class Diagonal>>; + using ConvertibleTo< + Diagonal>>>::convert_to; + using ConvertibleTo< + Diagonal>>>::move_to; void convert_to(Diagonal>>* result) const override; diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp index 6c337a5b634..b696f8418e6 100644 --- a/include/ginkgo/core/matrix/ell.hpp +++ b/include/ginkgo/core/matrix/ell.hpp @@ -125,6 +125,10 @@ class Ell : public EnableLinOp>, #if GINKGO_ENABLE_HALF friend class Ell>, IndexType>; + using ConvertibleTo< + Ell>, IndexType>>::convert_to; + using ConvertibleTo< + Ell>, IndexType>>::move_to; void convert_to(Ell>, IndexType>* result) const override; diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp index 5f9f96ed64a..05aa87833f5 100644 --- a/include/ginkgo/core/matrix/fbcsr.hpp +++ b/include/ginkgo/core/matrix/fbcsr.hpp @@ -189,6 +189,10 @@ class Fbcsr : public EnableLinOp>, #if GINKGO_ENABLE_HALF friend class Fbcsr>, IndexType>; + using ConvertibleTo>, + IndexType>>::convert_to; + using ConvertibleTo< + Fbcsr>, IndexType>>::move_to; void convert_to(Fbcsr>, IndexType>* result) const override; diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index ec6be5ef82b..cfa72d9a693 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -400,6 +400,10 @@ class Hybrid #if GINKGO_ENABLE_HALF friend class Hybrid>, IndexType>; + using ConvertibleTo>, + IndexType>>::convert_to; + using ConvertibleTo< + Hybrid>, IndexType>>::move_to; void convert_to(Hybrid>, IndexType>* result) const override; diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp index 9dcfe547734..030301fe830 100644 --- a/include/ginkgo/core/matrix/sellp.hpp +++ 
b/include/ginkgo/core/matrix/sellp.hpp @@ -116,6 +116,10 @@ class Sellp : public EnableLinOp>, #if GINKGO_ENABLE_HALF friend class Sellp>, IndexType>; + using ConvertibleTo>, + IndexType>>::convert_to; + using ConvertibleTo< + Sellp>, IndexType>>::move_to; void convert_to(Sellp>, IndexType>* result) const override; From faea48d14c18ee4063bf9773f61ea6bfc78b0cdf Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Sat, 25 Mar 2023 09:30:48 +0100 Subject: [PATCH 24/48] update rounding --- core/test/base/extended_float.cpp | 25 ++++++++++++++----------- include/ginkgo/core/base/half.hpp | 17 +++++++++++++++-- test/components/fill_array_kernels.cpp | 11 ++++++++--- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp index 6901ae72152..6098a70b728 100644 --- a/core/test/base/extended_float.cpp +++ b/core/test/base/extended_float.cpp @@ -204,18 +204,21 @@ TEST_F(FloatToHalf, TruncatesSmallNumber) } -TEST_F(FloatToHalf, TruncatesLargeNumber) +TEST_F(FloatToHalf, TruncatesLargeNumberRoundToEven) { - half x = create_from_bits("1" "10001110" "10010011111000010000100"); - - #if defined(SYCL_LANGUAGE_VERSION) && \ - (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) - // TODO: sycl::half seems to did rounding, but ours just truncates - ASSERT_EQ(get_bits(x), get_bits("1" "11110" "1001010000")); - #else - ASSERT_EQ(get_bits(x), get_bits("1" "11110" "1001001111")); - #endif - + half neg_x = create_from_bits("1" "10001110" "10010011111000010000100"); + half neg_x2 = create_from_bits("1" "10001110" "10010011101000010000100"); + half x = create_from_bits("0" "10001110" "10010011111000010000100"); + half x2 = create_from_bits("0" "10001110" "10010011101000010000100"); + half x3 = create_from_bits("0" "10001110" "10010011101000000000000"); + half x4 = create_from_bits("0" "10001110" "10010011111000000000000"); + + EXPECT_EQ(get_bits(x), get_bits("0" "11110" "1001010000")); + EXPECT_EQ(get_bits(x2), get_bits("0" "11110" "1001001111")); + EXPECT_EQ(get_bits(x3), get_bits("0" "11110" "1001001110")); + EXPECT_EQ(get_bits(x4), get_bits("0" "11110" "1001010000")); + EXPECT_EQ(get_bits(neg_x), get_bits("1" "11110" "1001010000")); + EXPECT_EQ(get_bits(neg_x2), get_bits("1" "11110" "1001001111")); } diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 1a8c1e1dfd1..446d085754d 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -462,8 +462,21 @@ class half { // TODO: handle denormals return conv::shift_sign(data_); } else { - return conv::shift_sign(data_) | exp | - conv::shift_significand(data_); + // Rounding to even + const auto result = conv::shift_sign(data_) | exp | + conv::shift_significand(data_); + // return result + ((result & 1) && + // ((data_ >> (f32_traits::significand_bits - + // f16_traits::significand_bits - 1)) & + // 1)); + const auto tail = + data_ & static_cast( + (1 << conv::significand_offset) - 1); + + constexpr auto half = static_cast( + 1 << (conv::significand_offset - 1)); + return result + + (tail > half || ((tail == half) && (result & 1))); } } } diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp index 8ee0089c49c..bb7e195ad2c 100644 --- a/test/components/fill_array_kernels.cpp +++ b/test/components/fill_array_kernels.cpp @@ -53,7 +53,7 @@ class FillArray : public CommonTestFixture { protected: using value_type = T; FillArray() - : 
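[editor note] The new rounding code in half.hpp above implements round-to-nearest-even on the discarded significand bits, which is exactly what the TruncatesLargeNumberRoundToEven expectations encode. A minimal sketch of just that rule, assuming the usual 23-bit float / 10-bit half significands and ignoring the carry into the exponent that a full converter must also handle:

    #include <cstdint>

    // Round a 23-bit float significand down to 10 bits with ties-to-even.
    std::uint32_t round_significand_to_even(std::uint32_t sig23)
    {
        constexpr int shift = 13;  // 23 explicit float bits -> 10 explicit half bits
        const std::uint32_t truncated = sig23 >> shift;
        const std::uint32_t tail = sig23 & ((std::uint32_t{1} << shift) - 1);
        const std::uint32_t halfway = std::uint32_t{1} << (shift - 1);
        // Round up when the tail is above the halfway point, or exactly at the
        // halfway point while the truncated result is odd (ties to even).
        return truncated + (tail > halfway || (tail == halfway && (truncated & 1)));
    }

On the test's pattern "10010011111000010000100" the discarded tail lies above the halfway point, so the half significand becomes "1001010000" instead of the plain truncation "1001001111", matching the updated expectation.
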
total_size(63531), + : total_size(3000), vals{ref, total_size}, dvals{exec, total_size}, seqs{ref, total_size} @@ -68,8 +68,8 @@ class FillArray : public CommonTestFixture { gko::array seqs; }; -TYPED_TEST_SUITE(FillArray, gko::test::ValueAndIndexTypes, - TypenameNameGenerator); +using LIST = ::testing::Types; +TYPED_TEST_SUITE(FillArray, LIST, TypenameNameGenerator); TYPED_TEST(FillArray, EqualsReference) @@ -88,5 +88,10 @@ TYPED_TEST(FillArray, FillSeqEqualsReference) gko::kernels::EXEC_NAMESPACE::components::fill_seq_array( this->exec, this->dvals.get_data(), this->total_size); + this->dvals.set_executor(this->ref); + for (gko::size_type i = 2000; i < this->total_size; i++) { + std::cout << i << " " << this->seqs.get_data()[i] << " device " + << this->dvals.get_data()[i] << std::endl; + } GKO_ASSERT_ARRAY_EQ(this->seqs, this->dvals); } From f1c118133a8b0eabf5688f4e233a14757d29c315 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 27 Mar 2023 12:26:17 +0200 Subject: [PATCH 25/48] do not use distribution with half --- core/test/solver/gmres.cpp | 7 +- core/test/solver/multigrid.cpp | 4 +- core/test/utils.hpp | 69 ++++++++++++++----- core/test/utils/array_generator_test.cpp | 2 +- core/test/utils/fb_matrix_generator.hpp | 13 ++-- core/test/utils/fb_matrix_generator_test.cpp | 4 +- core/test/utils/matrix_generator_test.cpp | 16 ++--- core/test/utils/matrix_utils_test.cpp | 4 +- include/ginkgo/core/preconditioner/ilu.hpp | 3 +- .../test/factorization/par_ilut_kernels.cpp | 25 ++++--- reference/test/matrix/dense_kernels.cpp | 3 +- reference/test/matrix/fbcsr_kernels.cpp | 3 +- reference/test/matrix/fft_kernels.cpp | 3 +- reference/test/solver/direct.cpp | 2 +- reference/test/solver/multigrid_kernels.cpp | 2 +- test/base/device_matrix_data_kernels.cpp | 2 +- test/factorization/par_ic_kernels.cpp | 2 +- test/factorization/par_ict_kernels.cpp | 8 +-- test/factorization/par_ilu_kernels.cpp | 2 +- test/factorization/par_ilut_kernels.cpp | 28 +++----- test/matrix/fbcsr_kernels.cpp | 7 +- test/solver/direct.cpp | 4 +- 22 files changed, 118 insertions(+), 95 deletions(-) diff --git a/core/test/solver/gmres.cpp b/core/test/solver/gmres.cpp index 11cafe2c86f..3e54f7a6d04 100644 --- a/core/test/solver/gmres.cpp +++ b/core/test/solver/gmres.cpp @@ -60,8 +60,8 @@ class Gmres : public ::testing::Test { using Solver = gko::solver::Gmres; using Big_solver = gko::solver::Gmres; - static constexpr gko::remove_complex reduction_factor = - gko::remove_complex(1e-6); + // half does not have constexpr constructor + static const gko::remove_complex reduction_factor; Gmres() : exec(gko::ReferenceExecutor::create()), @@ -97,7 +97,8 @@ class Gmres : public ::testing::Test { }; template -constexpr gko::remove_complex Gmres::reduction_factor; +const gko::remove_complex Gmres::reduction_factor = + gko::remove_complex(1e-6); TYPED_TEST_SUITE(Gmres, gko::test::ValueTypes, TypenameNameGenerator); diff --git a/core/test/solver/multigrid.cpp b/core/test/solver/multigrid.cpp index 856f9651ebe..b8672ef7079 100644 --- a/core/test/solver/multigrid.cpp +++ b/core/test/solver/multigrid.cpp @@ -108,9 +108,7 @@ class DummyLinOpWithFactory std::make_shared(this->get_executor(), gko::dim<2>{n_, n_ - 1}), gko::share(gko::test::generate_random_dense_matrix( - n_ - 1, n_ - 1, - std::uniform_real_distribution>( - 0, 1), + n_ - 1, n_ - 1, std::uniform_real_distribution<>(0, 1), std::default_random_engine{}, factory->get_executor())), std::make_shared(this->get_executor(), gko::dim<2>{n_ - 1, n_})); diff --git 
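[editor note] The gmres.cpp change above is needed because gko::half has no constexpr constructor, so remove_complex<T> is not a literal type when T is half and a static constexpr data member no longer compiles. A reduced sketch of the workaround, with illustrative names only:

    struct half_like {
        explicit half_like(float v) : value(v) {}  // not constexpr, like gko::half
        float value;
    };

    template <typename T>
    struct SolverFixture {
        // "static constexpr T reduction_factor = T(1e-6f);" is rejected for a
        // non-literal T, so the member is declared const instead ...
        static const T reduction_factor;
    };

    // ... and defined out of class, where a runtime constructor call is fine.
    template <typename T>
    const T SolverFixture<T>::reduction_factor = T(1e-6f);
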
a/core/test/utils.hpp b/core/test/utils.hpp index fd1056df2e8..74aad6fe5c1 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -65,12 +65,21 @@ namespace test { using ValueTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types>; + ::testing::Types>; #else - ::testing::Types, std::complex>; + ::testing::Types, + std::complex, std::complex>; #endif using ComplexValueTypes = +#if GINKGO_DPCPP_SINGLE_MODE + ::testing::Types, std::complex>; +#else + ::testing::Types, std::complex, + std::complex>; +#endif + +using ComplexValueTypesNoHalf = #if GINKGO_DPCPP_SINGLE_MODE ::testing::Types>; #else @@ -79,9 +88,9 @@ using ComplexValueTypes = using RealValueTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types; + ::testing::Types; #else - ::testing::Types; + ::testing::Types; #endif @@ -95,42 +104,53 @@ using LocalGlobalIndexTypes = using PODTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types; + ::testing::Types; #else - ::testing::Types; + ::testing::Types; #endif using ValueAndIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types, gko::int32, gko::int64, + ::testing::Types, + std::complex, gko::int32, gko::int64, gko::size_type>; #else - ::testing::Types, std::complex, - gko::int32, gko::int64, gko::size_type>; + ::testing::Types, + std::complex, std::complex, gko::int32, + gko::int64, gko::size_type>; #endif using RealValueAndIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types; + ::testing::Types; #else - ::testing::Types; + ::testing::Types; #endif using ValueIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types, + ::testing::Types, + std::tuple, + std::tuple, gko::int32>, std::tuple, gko::int32>, + std::tuple, std::tuple, gko::int64>>; #else ::testing::Types< - std::tuple, std::tuple, + std::tuple, std::tuple, + std::tuple, + std::tuple, gko::int32>, + std::tuple, gko::int32>, std::tuple, gko::int32>, std::tuple, gko::int32>, - std::tuple, std::tuple, + std::tuple, std::tuple, + std::tuple, + std::tuple, gko::int64>, std::tuple, gko::int64>, std::tuple, gko::int64>>; #endif @@ -138,22 +158,28 @@ using ValueIndexTypes = using RealValueIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types, - std::tuple>; + ::testing::Types< + std::tuple, std::tuple, + std::tuple, std::tuple>; #else ::testing::Types< - std::tuple, std::tuple, + std::tuple, std::tuple, + std::tuple, std::tuple, std::tuple, std::tuple>; #endif using ComplexValueIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types, gko::int32>, + ::testing::Types, gko::int32>, + std::tuple, gko::int32>, + std::tuple, gko::int64>, std::tuple, gko::int64>>; #else - ::testing::Types, gko::int32>, + ::testing::Types, gko::int32>, + std::tuple, gko::int32>, std::tuple, gko::int32>, + std::tuple, gko::int64>, std::tuple, gko::int64>, std::tuple, gko::int64>>; #endif @@ -311,6 +337,11 @@ namespace detail { template struct next_precision_impl {}; +template <> +struct next_precision_impl { + using type = gko::half; +}; + template <> struct next_precision_impl { using type = double; diff --git a/core/test/utils/array_generator_test.cpp b/core/test/utils/array_generator_test.cpp index 72214c49d7c..018652f88a4 100644 --- a/core/test/utils/array_generator_test.cpp +++ b/core/test/utils/array_generator_test.cpp @@ -53,7 +53,7 @@ class ArrayGenerator : public ::testing::Test { ArrayGenerator() : exec(gko::ReferenceExecutor::create()) { array = gko::test::generate_random_array( - 500, std::normal_distribution>(20.0, 5.0), + 500, std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec); } diff 
--git a/core/test/utils/fb_matrix_generator.hpp b/core/test/utils/fb_matrix_generator.hpp index 7c43b0905c1..71c92f6b990 100644 --- a/core/test/utils/fb_matrix_generator.hpp +++ b/core/test/utils/fb_matrix_generator.hpp @@ -161,16 +161,15 @@ std::unique_ptr> generate_fbcsr_from_csr( const IndexType* const row_ptrs = fmtx->get_const_row_ptrs(); const IndexType* const col_idxs = fmtx->get_const_col_idxs(); ValueType* const vals = fmtx->get_values(); - std::uniform_real_distribution> - off_diag_dist(-1.0, 1.0); + std::uniform_real_distribution<> off_diag_dist(-1.0, 1.0); for (IndexType ibrow = 0; ibrow < nbrows; ibrow++) { if (row_diag_dominant) { const IndexType nrownz = (row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * block_size; - std::uniform_real_distribution> - diag_dist(1.01 * nrownz, 2 * nrownz); + std::uniform_real_distribution<> diag_dist(1.01 * nrownz, + 2 * nrownz); for (IndexType ibz = row_ptrs[ibrow]; ibz < row_ptrs[ibrow + 1]; ibz++) { @@ -235,13 +234,11 @@ std::unique_ptr> generate_random_fbcsr( matrix::Csr>( nbrows, nbcols, std::uniform_int_distribution(0, nbcols - 1), - std::normal_distribution(0.0, 1.0), - std::move(engine), ref) + std::normal_distribution<>(0.0, 1.0), std::move(engine), ref) : generate_random_matrix>( nbrows, nbcols, std::uniform_int_distribution(0, nbcols - 1), - std::normal_distribution(0.0, 1.0), - std::move(engine), ref); + std::normal_distribution<>(0.0, 1.0), std::move(engine), ref); if (unsort && rand_csr_ref->is_sorted_by_column_index()) { unsort_matrix(rand_csr_ref, engine); } diff --git a/core/test/utils/fb_matrix_generator_test.cpp b/core/test/utils/fb_matrix_generator_test.cpp index fe11e27ac9d..ebb45fccefb 100644 --- a/core/test/utils/fb_matrix_generator_test.cpp +++ b/core/test/utils/fb_matrix_generator_test.cpp @@ -59,8 +59,8 @@ class BlockMatrixGenerator : public ::testing::Test { : exec(gko::ReferenceExecutor::create()), mtx(gko::test::generate_random_matrix< gko::matrix::Csr>( - nbrows, nbcols, std::normal_distribution(10, 5), - std::normal_distribution(20.0, 5.0), + nbrows, nbcols, std::normal_distribution<>(10, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), rbmtx(gko::test::generate_fbcsr_from_csr( exec, mtx.get(), blk_sz, false, std::default_random_engine(42))), diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp index 411d5ec17d0..c4820244231 100644 --- a/core/test/utils/matrix_generator_test.cpp +++ b/core/test/utils/matrix_generator_test.cpp @@ -57,25 +57,25 @@ class MatrixGenerator : public ::testing::Test { MatrixGenerator() : exec(gko::ReferenceExecutor::create()), mtx(gko::test::generate_random_matrix( - 500, 100, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 500, 100, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), dense_mtx(gko::test::generate_random_dense_matrix( - 500, 100, std::normal_distribution(20.0, 5.0), + 500, 100, std::normal_distribution<>(20.0, 5.0), std::default_random_engine(41), exec)), l_mtx(gko::test::generate_random_lower_triangular_matrix( - 4, true, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 4, true, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), u_mtx(gko::test::generate_random_upper_triangular_matrix( - 4, true, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 4, true, std::normal_distribution<>(50, 5), + 
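[editor note] The generator changes above and in the following test files share one cause: the standard <random> distributions are only specified for float, double and long double, so std::normal_distribution<gko::half> is not a valid instantiation. The replacements draw in the default double type and narrow afterwards; a minimal sketch (half_t stands in for gko::half):

    #include <random>

    template <typename half_t>
    half_t random_half_value(std::default_random_engine& engine)
    {
        // std::normal_distribution<> defaults to double, which is always allowed;
        // the value is narrowed to the 16-bit type only at the end.
        std::normal_distribution<> dist(0.0, 1.0);
        return static_cast<half_t>(dist(engine));
    }
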
std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), lower_bandwidth(2), upper_bandwidth(3), band_mtx(gko::test::generate_random_band_matrix( 100, lower_bandwidth, upper_bandwidth, - std::normal_distribution(20.0, 5.0), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), nnz_per_row_sample(500, 0), values_sample(0), diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp index 31a6072270e..28512266ff1 100644 --- a/core/test/utils/matrix_utils_test.cpp +++ b/core/test/utils/matrix_utils_test.cpp @@ -62,8 +62,8 @@ class MatrixUtils : public ::testing::Test { MatrixUtils() : exec(gko::ReferenceExecutor::create()), data(gko::test::generate_random_matrix_data( - 500, 500, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 500, 500, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42))), rectangular_data(gko::dim<2>(500, 100)) {} diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp index 7db9d19c7c2..19a1759ef0d 100644 --- a/include/ginkgo/core/preconditioner/ilu.hpp +++ b/include/ginkgo/core/preconditioner/ilu.hpp @@ -388,7 +388,8 @@ class Ilu : public EnableLinOp< generate_default_solver(const std::shared_ptr& exec, const std::shared_ptr& mtx) { - constexpr gko::remove_complex default_reduce_residual{1e-4}; + // half can not use constexpr constructor + const gko::remove_complex default_reduce_residual{1e-4}; const unsigned int default_max_iters{ static_cast(mtx->get_size()[0])}; diff --git a/reference/test/factorization/par_ilut_kernels.cpp b/reference/test/factorization/par_ilut_kernels.cpp index 9da285ec3eb..3227e33cce6 100644 --- a/reference/test/factorization/par_ilut_kernels.cpp +++ b/reference/test/factorization/par_ilut_kernels.cpp @@ -86,6 +86,7 @@ class ParIlut : public ::testing::Test { using ComplexCsr = gko::matrix::Csr>, index_type>; + using complex_value_type = std::complex>; ParIlut() : ref(gko::ReferenceExecutor::create()), @@ -107,16 +108,24 @@ class ParIlut : public ::testing::Test { {0., -3., 0., 1.}}, ref)), mtx1_complex(gko::initialize( - {{{.1, 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, - {{-1., .1}, {.1, -1.}, {0., 0.}, {0., 0.}}, - {{-1., 1.}, {-2., .2}, {-1., -.3}, {0., 0.}}, - {{1., -2.}, {-3., -.1}, {-1., .1}, {.1, 2.}}}, + {{complex_value_type{.1, 0.}, complex_value_type{0., 0.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{-1., .1}, complex_value_type{.1, -1.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{-1., 1.}, complex_value_type{-2., .2}, + complex_value_type{-1., -.3}, complex_value_type{0., 0.}}, + {complex_value_type{1., -2.}, complex_value_type{-3., -.1}, + complex_value_type{-1., .1}, complex_value_type{.1, 2.}}}, ref)), mtx1_expect_complex_thrm(gko::initialize( - {{{.1, 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, - {{0., 0.}, {.1, -1.}, {0., 0.}, {0., 0.}}, - {{-1., 1.}, {-2., .2}, {-1., -.3}, {0., 0.}}, - {{1., -2.}, {-3., -.1}, {0., 0.}, {.1, 2.}}}, + {{complex_value_type{.1, 0.}, complex_value_type{0., 0.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{0., 0.}, complex_value_type{.1, -1.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{-1., 1.}, complex_value_type{-2., .2}, + complex_value_type{-1., -.3}, complex_value_type{0., 0.}}, + {complex_value_type{1., -2.}, complex_value_type{-3., -.1}, + 
complex_value_type{0., 0.}, complex_value_type{.1, 2.}}}, ref)), identity(gko::initialize( {{1., 0., 0.}, {0., 1., 0.}, {0., 0., 1.}}, ref)), diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index bb90097afa1..13142e0e0d4 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -106,8 +106,7 @@ class Dense : public ::testing::Test { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution(num_cols, num_cols), - std::normal_distribution>(0.0, 1.0), - rand_engine, exec); + std::normal_distribution<>(0.0, 1.0), rand_engine, exec); } }; diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp index 2dea452c655..260796c197c 100644 --- a/reference/test/matrix/fbcsr_kernels.cpp +++ b/reference/test/matrix/fbcsr_kernels.cpp @@ -146,7 +146,8 @@ std::unique_ptr> get_some_vectors( { using RT = gko::remove_complex; std::default_random_engine engine(39); - std::normal_distribution dist(0.0, 5.0); + std::normal_distribution::type> dist( + 0.0, 5.0); std::uniform_int_distribution<> nnzdist(1, nrhs); return gko::test::generate_random_matrix>( nrows, nrhs, nnzdist, dist, engine, exec); diff --git a/reference/test/matrix/fft_kernels.cpp b/reference/test/matrix/fft_kernels.cpp index ba75582c6af..7cde806bed7 100644 --- a/reference/test/matrix/fft_kernels.cpp +++ b/reference/test/matrix/fft_kernels.cpp @@ -181,7 +181,8 @@ class Fft : public ::testing::Test { std::unique_ptr dense_ifft3; }; -TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypesNoHalf, + TypenameNameGenerator); TYPED_TEST(Fft, ThrowsOnNonPowerOfTwo1D) diff --git a/reference/test/solver/direct.cpp b/reference/test/solver/direct.cpp index 617015bac1f..fab146135f5 100644 --- a/reference/test/solver/direct.cpp +++ b/reference/test/solver/direct.cpp @@ -81,7 +81,7 @@ class Direct : public ::testing::Test { .on(exec)) .on(exec); solver = factory->generate(mtx); - std::normal_distribution> dist(0, 1); + std::normal_distribution<> dist(0, 1); x = gko::test::generate_random_dense_matrix( mtx->get_size()[0], nrhs, dist, rng, this->exec); x_ref = x->clone(); diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp index 3b32d2a1235..be19cc6f29e 100644 --- a/reference/test/solver/multigrid_kernels.cpp +++ b/reference/test/solver/multigrid_kernels.cpp @@ -186,7 +186,7 @@ class DummyLinOpWithFactory { auto alpha_value = gko::as>(alpha)->at(0, 0); - gko::remove_complex scale = std::real(alpha_value); + gko::remove_complex scale = gko::real(alpha_value); global_step *= static_cast(scale); step.push_back(global_step); global_step++; diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp index edb94ef0beb..0d211896f06 100644 --- a/test/base/device_matrix_data_kernels.cpp +++ b/test/base/device_matrix_data_kernels.cpp @@ -67,7 +67,7 @@ class DeviceMatrixData : public CommonTestFixture { 0, host_data.size[0] - 1); std::uniform_int_distribution col_distr( 0, host_data.size[1] - 1); - std::uniform_real_distribution> + std::uniform_real_distribution<> val_distr(1.0, 2.0); // add random entries for (int i = 0; i < 1000; i++) { diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp index 6e907acaa37..25c7c6450af 100644 --- a/test/factorization/par_ic_kernels.cpp +++ b/test/factorization/par_ic_kernels.cpp @@ -73,7 
+73,7 @@ class ParIc : public CommonTestFixture { mtx_l = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(0, 10.0), + std::normal_distribution<>(0, 10.0), rand_engine, ref); dmtx_ani = Csr::create(exec); dmtx_l_ani = Csr::create(exec); diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp index 6f85229bb8a..514abf1bbe3 100644 --- a/test/factorization/par_ict_kernels.cpp +++ b/test/factorization/par_ict_kernels.cpp @@ -79,15 +79,11 @@ class ParIct : public CommonTestFixture { mtx = gko::test::generate_random_matrix( mtx_size[0], mtx_size[1], std::uniform_int_distribution(10, mtx_size[1]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_l = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); dmtx_ani = Csr::create(exec); dmtx_l_ani = Csr::create(exec); diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp index a6c0706e6ee..480f62057b8 100644 --- a/test/factorization/par_ilu_kernels.cpp +++ b/test/factorization/par_ilu_kernels.cpp @@ -90,7 +90,7 @@ class ParIlu : public CommonTestFixture { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution(0, num_cols - 1), - std::normal_distribution>(0.0, 1.0), + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); } diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp index e457515e72c..b664b4c4108 100644 --- a/test/factorization/par_ilut_kernels.cpp +++ b/test/factorization/par_ilut_kernels.cpp @@ -80,39 +80,27 @@ class ParIlut : public CommonTestFixture { mtx1 = gko::test::generate_random_matrix( mtx_size[0], mtx_size[1], std::uniform_int_distribution(10, mtx_size[1]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx2 = gko::test::generate_random_matrix( mtx_size[0], mtx_size[1], std::uniform_int_distribution(0, mtx_size[1]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_square = gko::test::generate_random_matrix( mtx_size[0], mtx_size[0], std::uniform_int_distribution(1, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_l = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_l2 = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], true, std::uniform_int_distribution(1, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_u = gko::test::generate_random_upper_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); dmtx1 = gko::clone(exec, mtx1); dmtx2 = gko::clone(exec, mtx2); @@ -166,7 +154,7 @@ class ParIlut : public CommonTestFixture { const 
std::unique_ptr& dmtx, index_type rank) { double tolerance = - gko::is_complex() ? r::value : 0.0; + gko::is_complex() ? double(r::value) : 0.0; auto size = index_type(mtx->get_num_stored_elements()); using ValueType = typename Mtx::value_type; @@ -221,7 +209,7 @@ class ParIlut : public CommonTestFixture { const std::unique_ptr& dmtx, index_type rank) { double tolerance = - gko::is_complex() ? r::value : 0.0; + gko::is_complex() ? double(r::value) : 0.0; auto res = Mtx::create(ref, mtx_size); auto dres = Mtx::create(exec, mtx_size); auto res_coo = Coo::create(ref, mtx_size); diff --git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp index ee32c52a358..b593e075b14 100644 --- a/test/matrix/fbcsr_kernels.cpp +++ b/test/matrix/fbcsr_kernels.cpp @@ -69,7 +69,7 @@ class Fbcsr : public CommonTestFixture { std::unique_ptr rsorted; - std::normal_distribution> distb; + std::normal_distribution<> distb; std::default_random_engine engine; value_type get_random_value() @@ -83,7 +83,10 @@ class Fbcsr : public CommonTestFixture { for (index_type i = 0; i < x->get_size()[0] * x->get_size()[1]; i++) { xarr[i] = static_cast(2.0) * - std::sin(static_cast(i / 2.0) + get_random_value()); + static_cast( + std::sin(static_cast< + typename gko::detail::arth_type::type>( + static_cast(i / 2.0) + get_random_value()))); } } }; diff --git a/test/solver/direct.cpp b/test/solver/direct.cpp index 0a30f7ba67f..c1c14901a56 100644 --- a/test/solver/direct.cpp +++ b/test/solver/direct.cpp @@ -82,9 +82,7 @@ class Direct : public CommonTestFixture { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution<>(num_cols, num_cols), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); } void initialize_data(const char* mtx_filename, int nrhs) From d450dc5bfdf3ed3b8f5422454022b2d0683dd65a Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Mon, 27 Mar 2023 23:10:12 +0200 Subject: [PATCH 26/48] WIP fix half of failed test --- core/base/mtx_io.cpp | 4 ++ core/test/base/extended_float.cpp | 18 +++++- core/test/log/stream.cpp | 70 +++++++++++------------ core/test/solver/gcr.cpp | 6 +- core/test/utils.hpp | 11 +++- core/test/utils/matrix_generator_test.cpp | 12 ++-- core/test/utils/matrix_utils_test.cpp | 4 +- cuda/base/types.hpp | 4 +- include/ginkgo/core/base/half.hpp | 24 +++++--- include/ginkgo/core/base/math.hpp | 3 +- reference/test/matrix/coo_kernels.cpp | 2 + reference/test/solver/bicg_kernels.cpp | 4 ++ reference/test/solver/cg_kernels.cpp | 6 ++ reference/test/solver/fcg_kernels.cpp | 6 ++ reference/test/solver/gmres_kernels.cpp | 7 +++ test/components/reduce_array_kernels.cpp | 4 +- test/matrix/fbcsr_kernels.cpp | 12 ++++ test/matrix/fft_kernels.cpp | 2 +- 18 files changed, 139 insertions(+), 60 deletions(-) diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index 660b3bea313..2616feb0530 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -910,12 +910,16 @@ matrix_data read_binary_raw(std::istream& is) } DECLARE_OVERLOAD(double, int32) DECLARE_OVERLOAD(float, int32) + DECLARE_OVERLOAD(half, int32) DECLARE_OVERLOAD(std::complex, int32) DECLARE_OVERLOAD(std::complex, int32) + DECLARE_OVERLOAD(std::complex, int32) DECLARE_OVERLOAD(double, int64) DECLARE_OVERLOAD(float, int64) + DECLARE_OVERLOAD(half, int64) DECLARE_OVERLOAD(std::complex, int64) DECLARE_OVERLOAD(std::complex, int64) + DECLARE_OVERLOAD(std::complex, int64) #undef DECLARE_OVERLOAD else { diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp index 6098a70b728..dda19bd087a 100644 --- a/core/test/base/extended_float.cpp +++ b/core/test/base/extended_float.cpp @@ -38,7 +38,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include - +#include +#include "ginkgo/core/base/math.hpp" namespace { @@ -222,6 +223,21 @@ TEST_F(FloatToHalf, TruncatesLargeNumberRoundToEven) } +TEST_F(FloatToHalf, Convert) +{ + float rho = 86.25; + float beta = 1110; + auto float_res = rho/beta; + gko::half rho_h = rho; + gko::half beta_h = beta; + auto half_res = rho_h/beta_h; + std::cout << float_res << std::endl; + std::cout << float(half_res) << std::endl; + + std::complex cpx{100.0, 0.0}; + std::cout << float(gko::squared_norm(cpx)) << std::endl; +} + // clang-format on diff --git a/core/test/log/stream.cpp b/core/test/log/stream.cpp index 3558a7d5564..82c5d831b70 100644 --- a/core/test/log/stream.cpp +++ b/core/test/log/stream.cpp @@ -413,17 +413,17 @@ TYPED_TEST(Stream, CatchesLinOpApplyStartedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_apply_started_mask, out, true); - auto A = gko::initialize({1.1}, exec); - auto b = gko::initialize({-2.2}, exec); - auto x = gko::initialize({3.3}, exec); + auto A = gko::initialize({1.5}, exec); + auto b = gko::initialize({-2.25}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on(A.get(), b.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -462,17 +462,17 @@ TYPED_TEST(Stream, CatchesLinOpApplyCompletedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_apply_completed_mask, out, true); - auto A = gko::initialize({1.1}, exec); - auto b = gko::initialize({-2.2}, exec); - auto x = gko::initialize({3.3}, exec); + auto A = gko::initialize({1.5}, exec); + auto b = gko::initialize({-2.25}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on( A.get(), b.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -519,21 +519,21 @@ TYPED_TEST(Stream, CatchesLinOpAdvancedApplyStartedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_advanced_apply_started_mask, out, true); - auto A = gko::initialize({1.1}, exec); - auto alpha = gko::initialize({-4.4}, exec); - auto b = gko::initialize({-2.2}, exec); + auto A = gko::initialize({1.5}, exec); + auto alpha = gko::initialize({-4.75}, exec); + auto b = gko::initialize({-2.25}, exec); auto beta = gko::initialize({-5.5}, exec); - auto x = gko::initialize({3.3}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on( A.get(), alpha.get(), b.get(), beta.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-4.4"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-4.75"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); GKO_ASSERT_STR_CONTAINS(os, "-5.5"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -580,21 +580,21 @@ TYPED_TEST(Stream, CatchesLinOpAdvancedApplyCompletedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_advanced_apply_completed_mask, out, true); - auto A = gko::initialize({1.1}, exec); - auto alpha = gko::initialize({-4.4}, 
exec); - auto b = gko::initialize({-2.2}, exec); + auto A = gko::initialize({1.5}, exec); + auto alpha = gko::initialize({-4.75}, exec); + auto b = gko::initialize({-2.25}, exec); auto beta = gko::initialize({-5.5}, exec); - auto x = gko::initialize({3.3}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on( A.get(), alpha.get(), b.get(), beta.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-4.4"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-4.75"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); GKO_ASSERT_STR_CONTAINS(os, "-5.5"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -818,11 +818,11 @@ TYPED_TEST(Stream, CatchesIterationsWithVerbose) .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on(exec)) .on(exec); - auto solver = factory->generate(gko::initialize({1.1}, exec)); + auto solver = factory->generate(gko::initialize({1.25}, exec)); auto right_hand_side = gko::initialize({-5.5}, exec); - auto residual = gko::initialize({-4.4}, exec); - auto solution = gko::initialize({-2.2}, exec); - auto residual_norm = gko::initialize({-3.3}, exec); + auto residual = gko::initialize({-4.5}, exec); + auto solution = gko::initialize({-2.25}, exec); + auto residual_norm = gko::initialize({-3.125}, exec); gko::array stop_status(exec, 1); logger->template on( @@ -831,9 +831,9 @@ TYPED_TEST(Stream, CatchesIterationsWithVerbose) auto os = out.str(); GKO_ASSERT_STR_CONTAINS(os, "-5.5"); - GKO_ASSERT_STR_CONTAINS(os, "-4.4"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); - GKO_ASSERT_STR_CONTAINS(os, "-3.3"); + GKO_ASSERT_STR_CONTAINS(os, "-4.5"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); + GKO_ASSERT_STR_CONTAINS(os, "-3.125"); GKO_ASSERT_STR_CONTAINS(os, "Finalized:") } diff --git a/core/test/solver/gcr.cpp b/core/test/solver/gcr.cpp index f7ba80ebba1..fd4053617fd 100644 --- a/core/test/solver/gcr.cpp +++ b/core/test/solver/gcr.cpp @@ -60,8 +60,7 @@ class Gcr : public ::testing::Test { using Solver = gko::solver::Gcr; using Big_solver = gko::solver::Gcr; - static constexpr gko::remove_complex reduction_factor = - gko::remove_complex(1e-6); + static const gko::remove_complex reduction_factor; Gcr() : exec(gko::ReferenceExecutor::create()), @@ -108,7 +107,8 @@ class Gcr : public ::testing::Test { }; template -constexpr gko::remove_complex Gcr::reduction_factor; +const gko::remove_complex Gcr::reduction_factor = + gko::remove_complex(1e-6); TYPED_TEST_SUITE(Gcr, gko::test::ValueTypes, TypenameNameGenerator); diff --git a/core/test/utils.hpp b/core/test/utils.hpp index 74aad6fe5c1..423f0972e36 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -339,7 +339,7 @@ struct next_precision_impl {}; template <> struct next_precision_impl { - using type = gko::half; + using type = float; }; template <> @@ -365,4 +365,13 @@ template using next_precision = typename detail::next_precision_impl::type; +#define SKIP_IF_HALF(type) \ + if (std::is_same, gko::half>::value) { \ + GTEST_SKIP() << "Skip due to single mode"; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + + #endif // GKO_CORE_TEST_UTILS_HPP_ diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp index c4820244231..dd3215f0ce9 100644 --- a/core/test/utils/matrix_generator_test.cpp +++ b/core/test/utils/matrix_generator_test.cpp @@ -51,6 +51,8 @@ template class 
MatrixGenerator : public ::testing::Test { protected: using value_type = T; + using check_type = + typename gko::detail::arth_type>::type; using real_type = gko::remove_complex; using mtx_type = gko::matrix::Dense; @@ -127,15 +129,15 @@ class MatrixGenerator : public ::testing::Test { template - ValueType get_nth_moment(int n, ValueType c, InputIterator sample_start, - InputIterator sample_end, Closure closure_op) + check_type get_nth_moment(int n, ValueType c, InputIterator sample_start, + InputIterator sample_end, Closure closure_op) { using std::pow; - ValueType res = 0; - ValueType num_elems = 0; + check_type res = 0; + check_type num_elems = 0; while (sample_start != sample_end) { auto tmp = *(sample_start++); - res += pow(closure_op(tmp) - c, n); + res += pow(check_type{closure_op(tmp)} - check_type{c}, n); num_elems += 1; } return res / num_elems; diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp index 28512266ff1..1640d1310c6 100644 --- a/core/test/utils/matrix_utils_test.cpp +++ b/core/test/utils/matrix_utils_test.cpp @@ -273,7 +273,7 @@ TYPED_TEST(MatrixUtils, MakeHpdMatrixCorrectly) TYPED_TEST(MatrixUtils, MakeHpdMatrixWithRatioCorrectly) { using T = typename TestFixture::value_type; - gko::remove_complex ratio = 1.00001; + gko::remove_complex ratio = 1.002; auto cpy_data = this->data; gko::utils::make_hpd(this->data, ratio); @@ -308,7 +308,7 @@ TYPED_TEST(MatrixUtils, MakeSpdMatrixCorrectly) TYPED_TEST(MatrixUtils, MakeSpdMatrixWithRatioCorrectly) { using T = typename TestFixture::value_type; - gko::remove_complex ratio = 1.00001; + gko::remove_complex ratio = 1.002; auto cpy_data = this->data; gko::utils::make_spd(this->data, ratio); diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index dd4e2b0929a..0986ff38f59 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -72,7 +72,7 @@ namespace thrust { template <> GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) { - return hypot(z.real(), z.imag()); + return abs(static_cast>(z)); } @@ -83,7 +83,7 @@ GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ { \ - return thrust::complex{lhs} + thrust::complex(rhs); \ + return thrust::complex{lhs} _op thrust::complex(rhs); \ } THRUST_HALF_FRIEND_OPERATOR(+, +=) diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 446d085754d..6b21646c66c 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -610,18 +610,28 @@ class complex { template complex& operator*=(const complex& val) { - auto tmp = real_; - real_ = real_ * val.real() - imag_ * val.imag(); - imag_ = tmp * val.imag() + imag_ * val.real(); + auto val_f = static_cast>(val); + auto result_f = static_cast>(*this); + result_f *= val_f; + real_ = result_f.real(); + imag_ = result_f.imag(); + // auto tmp = real_; + // real_ = real_ * val.real() - imag_ * val.imag(); + // imag_ = tmp * val.imag() + imag_ * val.real(); return *this; } template complex& operator/=(const complex& val) { - auto real = val.real(); - auto imag = val.imag(); - (*this) *= complex{val.real(), -val.imag()}; - (*this) /= (real * real + imag * imag); + // auto real = val.real(); + // auto imag = val.imag(); + // (*this) *= complex{val.real(), -val.imag()}; + // (*this) /= (real * real + imag * imag); + auto val_f = static_cast>(val); + auto result_f = static_cast>(*this); 
+ result_f /= val_f; + real_ = result_f.real(); + imag_ = result_f.imag(); return *this; } diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 55f4761dee2..4aa4a70de0e 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -73,7 +73,8 @@ inline gko::half abs(gko::half a) { return gko::half((a > 0) ? a : -a); } inline gko::half abs(std::complex a) { - return gko::half(sqrt(float(a.real() * a.real() + a.imag() * a.imag()))); + // Using float abs not sqrt on norm to avoid overflow + return gko::half(abs(std::complex(a))); } diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp index 8c5d88ea488..5c606dcae16 100644 --- a/reference/test/matrix/coo_kernels.cpp +++ b/reference/test/matrix/coo_kernels.cpp @@ -149,6 +149,8 @@ TYPED_TEST(Coo, MovesToPrecision) this->mtx->move_to(tmp); tmp->move_to(res); + // TODO: When use move_to to the different precision, it will keep the + // original data GKO_ASSERT_MTX_NEAR(this->mtx, res, residual); } diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp index eafba6ca123..63c5c4a1704 100644 --- a/reference/test/solver/bicg_kernels.cpp +++ b/reference/test/solver/bicg_kernels.cpp @@ -487,6 +487,7 @@ TYPED_TEST(Bicg, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -504,6 +505,7 @@ TYPED_TEST(Bicg, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -521,6 +523,7 @@ TYPED_TEST(Bicg, SolvesBigDenseSystemImplicitResNormCrit) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -552,6 +555,7 @@ TYPED_TEST(Bicg, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp index ffe594625ef..b97fe563a30 100644 --- a/reference/test/solver/cg_kernels.cpp +++ b/reference/test/solver/cg_kernels.cpp @@ -448,6 +448,7 @@ TYPED_TEST(Cg, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -465,6 +466,7 @@ TYPED_TEST(Cg, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -482,6 +484,7 @@ 
TYPED_TEST(Cg, SolvesBigDenseSystem3) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -499,6 +502,7 @@ TYPED_TEST(Cg, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -567,6 +571,7 @@ TYPED_TEST(Cg, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -584,6 +589,7 @@ TYPED_TEST(Cg, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, diff --git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp index 8a3a796c60a..e5803bfdc22 100644 --- a/reference/test/solver/fcg_kernels.cpp +++ b/reference/test/solver/fcg_kernels.cpp @@ -461,6 +461,7 @@ TYPED_TEST(Fcg, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -478,6 +479,7 @@ TYPED_TEST(Fcg, SolvesBigDenseSystemWithImplicitResNormCrit) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -495,6 +497,7 @@ TYPED_TEST(Fcg, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -512,6 +515,7 @@ TYPED_TEST(Fcg, SolvesMultipleBigDenseSystems) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -580,6 +584,7 @@ TYPED_TEST(Fcg, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -597,6 +602,7 @@ TYPED_TEST(Fcg, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = 
gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index 47cc8f0476a..002245c523c 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -606,6 +606,7 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -623,6 +624,7 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, @@ -640,6 +642,7 @@ TYPED_TEST(Gmres, SolveWithImplicitResNormCritIsDisabled) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, @@ -654,6 +657,7 @@ TYPED_TEST(Gmres, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -748,6 +752,7 @@ TYPED_TEST(Gmres, SolvesWithPreconditioner) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto gmres_factory_preconditioner = Solver::build() .with_criteria( @@ -778,6 +783,7 @@ TYPED_TEST(Gmres, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big->transpose()); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -795,6 +801,7 @@ TYPED_TEST(Gmres, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big->conj_transpose()); auto b = gko::initialize( diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp index 6738125ded6..90f9d532f7a 100644 --- a/test/components/reduce_array_kernels.cpp +++ b/test/components/reduce_array_kernels.cpp @@ -53,13 +53,13 @@ class ReduceArray : public CommonTestFixture { protected: using value_type = T; ReduceArray() - : total_size(6355), + : total_size(1024), out{ref, I{2}}, dout{exec, out}, vals{ref, total_size}, dvals{exec} { - std::fill_n(vals.get_data(), total_size, 3); + std::fill_n(vals.get_data(), total_size, 1); dvals = vals; } diff --git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp index b593e075b14..6e0fbe555dc 100644 --- a/test/matrix/fbcsr_kernels.cpp +++ b/test/matrix/fbcsr_kernels.cpp @@ -158,6 +158,9 @@ TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted) using Mtx = typename TestFixture::Mtx; using Dense = typename 
TestFixture::Dense; using value_type = typename Mtx::value_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + } auto drand = gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 1)); @@ -180,6 +183,9 @@ TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted) using Mtx = typename TestFixture::Mtx; using Dense = typename TestFixture::Dense; using value_type = typename Mtx::value_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + } auto drand = gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 3)); @@ -203,6 +209,9 @@ TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted) using Dense = typename TestFixture::Dense; using value_type = typename TestFixture::value_type; using real_type = typename TestFixture::real_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + } auto drand = gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 1)); @@ -233,6 +242,9 @@ TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted) using Dense = typename TestFixture::Dense; using value_type = typename TestFixture::value_type; using real_type = typename TestFixture::real_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + } auto drand = gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 3)); diff --git a/test/matrix/fft_kernels.cpp b/test/matrix/fft_kernels.cpp index fd9dda821c0..b598bb69bce 100644 --- a/test/matrix/fft_kernels.cpp +++ b/test/matrix/fft_kernels.cpp @@ -122,7 +122,7 @@ class Fft : public CommonTestFixture { }; -TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypesNoHalf, TypenameNameGenerator); TYPED_TEST(Fft, Apply1DIsEqualToReference) From e83d53df9ae7d8dd96793ff12cf5195c409e2514 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Tue, 13 Jun 2023 16:52:41 +0200 Subject: [PATCH 27/48] fix/skip half test and fix numeric_limit on device --- common/cuda_hip/base/math.hpp.inc | 7 +++ common/cuda_hip/matrix/csr_kernels.hpp.inc | 4 +- core/distributed/vector.cpp | 4 +- core/log/papi.cpp | 3 +- core/solver/idr.cpp | 9 ++++ core/test/base/extended_float.cpp | 6 ++- core/test/utils.hpp | 14 +++-- core/test/utils/matrix_generator_test.cpp | 13 +++-- cuda/CMakeLists.txt | 1 + cuda/matrix/fft_kernels.cu | 6 ++- dpcpp/matrix/csr_kernels.dp.cpp | 41 +++++++------- .../test/preconditioner/jacobi_kernels.dp.cpp | 21 ++++---- hip/test/matrix/fbcsr_kernels.cpp | 53 +++++++++++++------ include/ginkgo/core/base/half.hpp | 13 +++-- .../ginkgo/core/base/precision_dispatch.hpp | 3 +- omp/matrix/csr_kernels.cpp | 4 +- omp/matrix/fft_kernels.cpp | 6 ++- omp/solver/cb_gmres_kernels.cpp | 3 +- reference/matrix/csr_kernels.cpp | 4 +- reference/matrix/fft_kernels.cpp | 6 ++- reference/solver/cb_gmres_kernels.cpp | 3 +- reference/test/factorization/lu_kernels.cpp | 6 +-- .../test/preconditioner/isai_kernels.cpp | 7 +++ .../test/preconditioner/jacobi_kernels.cpp | 16 ++++-- reference/test/reorder/scaled_reordered.cpp | 8 +++ reference/test/solver/bicgstab_kernels.cpp | 19 ++++--- reference/test/solver/cgs_kernels.cpp | 10 ++++ reference/test/solver/gcr_kernels.cpp | 13 +++-- reference/test/solver/gmres_kernels.cpp | 1 + reference/test/solver/idr_kernels.cpp | 16 ++++-- test/base/device_matrix_data_kernels.cpp | 3 +- test/factorization/par_ic_kernels.cpp | 5 +- test/factorization/par_ict_kernels.cpp | 2 + test/factorization/par_ilu_kernels.cpp | 6 ++- test/factorization/par_ilut_kernels.cpp | 4 ++ test/matrix/fft_kernels.cpp | 3 +- test/mpi/matrix.cpp | 4 +- test/mpi/vector.cpp | 15 +++--- 38 files changed, 240 insertions(+), 122 deletions(-) diff --git a/common/cuda_hip/base/math.hpp.inc b/common/cuda_hip/base/math.hpp.inc index fa2850c10d6..39ab5e8baf5 100644 --- a/common/cuda_hip/base/math.hpp.inc +++ b/common/cuda_hip/base/math.hpp.inc @@ -39,6 +39,13 @@ struct device_numeric_limits { static constexpr auto min = std::numeric_limits::min(); }; +template <> +struct device_numeric_limits<__half> { + static constexpr auto inf = std::numeric_limits::infinity(); + static constexpr auto max = std::numeric_limits::max(); + static constexpr auto min = std::numeric_limits::min(); +}; + namespace detail { diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 3f02337747e..a6f6269a0b4 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -222,7 +222,7 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_spmv( { using arithmetic_type = typename output_accessor::arithmetic_type; using output_type = typename output_accessor::storage_type; - const arithmetic_type scale_factor = alpha[0]; + const arithmetic_type scale_factor = static_cast(alpha[0]); spmv_kernel(nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, c, [&scale_factor](const arithmetic_type& x) { return static_cast(scale_factor * x); @@ -430,7 +430,7 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_reduce( const IndexType* __restrict__ last_row, const MatrixValueType* __restrict__ alpha, acc::range c) { - const arithmetic_type alpha_val = alpha[0]; + const arithmetic_type alpha_val = static_cast(alpha[0]); merge_path_reduce( nwarps, last_val, last_row, c, [&alpha_val](const arithmetic_type& x) { return alpha_val * x; }); diff --git 
a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 81b9c96bcfc..4251ad43f06 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -600,8 +600,8 @@ ValueType& Vector::at_local(size_type row, size_type col) noexcept } template -ValueType Vector::at_local(size_type row, size_type col) const - noexcept +ValueType Vector::at_local(size_type row, + size_type col) const noexcept { return local_.at(row, col); } diff --git a/core/log/papi.cpp b/core/log/papi.cpp index 3d98e62d0d0..e9bb541e4b6 100644 --- a/core/log/papi.cpp +++ b/core/log/papi.cpp @@ -249,8 +249,7 @@ void Papi::on_criterion_check_completed( auto tmp_res_norm = Vector::create( residual->get_executor(), dim<2>{1, residual->get_size()[1]}); dense_r->compute_norm2(tmp_res_norm); - residual_norm_d = - static_cast(real(tmp_res_norm->at(0, 0))); + residual_norm_d = static_cast(real(tmp_res_norm->at(0, 0))); }); } diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index 52b1eddc11f..4dd5c0fc260 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include @@ -70,6 +71,10 @@ std::unique_ptr Idr::transpose() const .with_generated_preconditioner( share(as(this->get_preconditioner())->transpose())) .with_criteria(this->get_stop_criterion_factory()) + .with_subspace_dim(this->get_subspace_dim()) + .with_kappa(this->get_kappa()) + .with_deterministic(this->get_deterministic()) + .with_complex_subspace(this->get_complex_subspace()) .on(this->get_executor()) ->generate( share(as(this->get_system_matrix())->transpose())); @@ -83,6 +88,10 @@ std::unique_ptr Idr::conj_transpose() const .with_generated_preconditioner(share( as(this->get_preconditioner())->conj_transpose())) .with_criteria(this->get_stop_criterion_factory()) + .with_subspace_dim(this->get_subspace_dim()) + .with_kappa(this->get_kappa()) + .with_deterministic(this->get_deterministic()) + .with_complex_subspace(this->get_complex_subspace()) .on(this->get_executor()) ->generate(share( as(this->get_system_matrix())->conj_transpose())); diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp index dda19bd087a..c8d7b450701 100644 --- a/core/test/base/extended_float.cpp +++ b/core/test/base/extended_float.cpp @@ -34,12 +34,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
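The device_numeric_limits specialization for __half added above is needed because std::numeric_limits is not specialized for the vendor half type, so its members cannot be relied on from device code. A host-only sketch of the same trait idea, with an invented placeholder type standing in for __half and float limits assumed purely for illustration:

    #include <limits>

    // Primary template: forward to the standard limits of the type itself.
    template <typename T>
    struct device_numeric_limits {
        static constexpr auto inf = std::numeric_limits<T>::infinity();
        static constexpr auto max = std::numeric_limits<T>::max();
        static constexpr auto min = std::numeric_limits<T>::min();
    };

    // Stand-in for __half; the real specialization in the patch targets the
    // CUDA/HIP half type and is visible to device compilation.
    struct half_placeholder {};

    template <>
    struct device_numeric_limits<half_placeholder> {
        // Assumption for this sketch: float limits serve as the fallback
        // values; the actual constants used by the patch may differ.
        static constexpr auto inf = std::numeric_limits<float>::infinity();
        static constexpr auto max = std::numeric_limits<float>::max();
        static constexpr auto min = std::numeric_limits<float>::min();
    };
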
#include +#include #include #include -#include -#include "ginkgo/core/base/math.hpp" + + +#include namespace { diff --git a/core/test/utils.hpp b/core/test/utils.hpp index 423f0972e36..16e418d433e 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -65,12 +65,20 @@ namespace test { using ValueTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types>; + ::testing::Types, + std::complex>; #else ::testing::Types, std::complex, std::complex>; #endif +using ValueTypesNoHalf = +#if GINKGO_DPCPP_SINGLE_MODE + ::testing::Types>; +#else + ::testing::Types, std::complex>; +#endif + using ComplexValueTypes = #if GINKGO_DPCPP_SINGLE_MODE ::testing::Types, std::complex>; @@ -137,7 +145,6 @@ using ValueIndexTypes = std::tuple, std::tuple, gko::int32>, std::tuple, gko::int32>, - std::tuple, std::tuple, gko::int64>>; #else @@ -145,7 +152,6 @@ using ValueIndexTypes = std::tuple, std::tuple, std::tuple, std::tuple, gko::int32>, - std::tuple, gko::int32>, std::tuple, gko::int32>, std::tuple, gko::int32>, std::tuple, std::tuple, @@ -367,7 +373,7 @@ using next_precision = typename detail::next_precision_impl::type; #define SKIP_IF_HALF(type) \ if (std::is_same, gko::half>::value) { \ - GTEST_SKIP() << "Skip due to single mode"; \ + GTEST_SKIP() << "Skip due to half mode"; \ } \ static_assert(true, \ "This assert is used to counter the false positive extra " \ diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp index dd3215f0ce9..e703647ce9e 100644 --- a/core/test/utils/matrix_generator_test.cpp +++ b/core/test/utils/matrix_generator_test.cpp @@ -280,7 +280,7 @@ TYPED_TEST(MatrixGenerator, CanGenerateTridiagMatrix) { using T = typename TestFixture::value_type; using Dense = typename TestFixture::mtx_type; - auto dist = std::normal_distribution>(0, 1); + auto dist = std::normal_distribution<>(0, 1); auto engine = std::default_random_engine(42); auto lower = gko::test::detail::get_rand_value(dist, engine); auto diag = gko::test::detail::get_rand_value(dist, engine); @@ -304,18 +304,23 @@ TYPED_TEST(MatrixGenerator, CanGenerateTridiagInverseMatrix) { using T = typename TestFixture::value_type; using Dense = typename TestFixture::mtx_type; - auto dist = std::normal_distribution>(0, 1); + auto dist = std::normal_distribution<>(0, 1); auto engine = std::default_random_engine(42); auto lower = gko::test::detail::get_rand_value(dist, engine); auto upper = gko::test::detail::get_rand_value(dist, engine); // make diagonally dominant auto diag = std::abs(gko::test::detail::get_rand_value(dist, engine)) + std::abs(lower) + std::abs(upper); + gko::size_type size = 50; + if (std::is_same>::value) { + // half precision can only handle small matrix + size = 5; + } auto mtx = gko::test::generate_tridiag_matrix( - 50, {lower, diag, upper}, this->exec); + size, {lower, diag, upper}, this->exec); auto inv_mtx = gko::test::generate_tridiag_inverse_matrix( - 50, {lower, diag, upper}, this->exec); + size, {lower, diag, upper}, this->exec); auto result = Dense::create(this->exec, mtx->get_size()); inv_mtx->apply(mtx, result); diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 4c972d2a584..e9b3e1bd954 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -105,6 +105,7 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") target_compile_options(ginkgo_cuda PRIVATE $<$:--extended-lambda>) + target_compile_options(ginkgo_cuda PRIVATE -Xcompiler="/bigobj") else() target_compile_options(ginkgo_cuda PRIVATE diff --git a/cuda/matrix/fft_kernels.cu b/cuda/matrix/fft_kernels.cu 
index f9248df0125..8d1e32335cf 100644 --- a/cuda/matrix/fft_kernels.cu +++ b/cuda/matrix/fft_kernels.cu @@ -167,7 +167,8 @@ void fft2(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT2_KERNEL); template @@ -183,7 +184,8 @@ void fft3(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 415acd2cdc8..11d583fc284 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -298,7 +298,7 @@ void abstract_spmv( { using arithmetic_type = typename output_accessor::arithmetic_type; using output_type = typename output_accessor::storage_type; - const arithmetic_type scale_factor = alpha[0]; + const arithmetic_type scale_factor = static_cast(alpha[0]); spmv_kernel( nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, c, [&scale_factor](const arithmetic_type& x) { @@ -513,8 +513,8 @@ void abstract_merge_path_spmv( sycl::nd_item<3> item_ct1, IndexType* shared_row_ptrs) { using type = typename output_accessor::arithmetic_type; - const type alpha_val = alpha[0]; - const type beta_val = beta[0]; + const type alpha_val = static_cast(alpha[0]); + const type beta_val = static_cast(beta[0]); merge_path_spmv( num_rows, val, col_idxs, row_ptrs, srow, b, c, row_out, val_out, [&alpha_val](const type& x) { return alpha_val * x; }, @@ -605,7 +605,7 @@ void abstract_reduce( uninitialized_array& tmp_ind, uninitialized_array& tmp_val) { - const arithmetic_type alpha_val = alpha[0]; + const arithmetic_type alpha_val = static_cast(alpha[0]); merge_path_reduce( nwarps, last_val, last_row, c, [&alpha_val](const arithmetic_type& x) { return alpha_val * x; }, @@ -705,13 +705,13 @@ void abstract_classical_spmv( { if (subgroup_size > 1) { queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(subgroup_size)]] { - abstract_classical_spmv( - num_rows, val, col_idxs, row_ptrs, b, - c, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, c, item_ct1); + }); }); } else { queue->submit([&](sycl::handler& cgh) { @@ -736,8 +736,8 @@ void abstract_classical_spmv( acc::range c, sycl::nd_item<3> item_ct1) { using type = typename output_accessor::arithmetic_type; - const type alpha_val = alpha[0]; - const type beta_val = beta[0]; + const type alpha_val = static_cast(alpha[0]); + const type beta_val = static_cast(beta[0]); device_classical_spmv( num_rows, val, col_idxs, row_ptrs, b, c, [&alpha_val, &beta_val](const type& x, const type& y) { @@ -759,13 +759,14 @@ void abstract_classical_spmv( { if (subgroup_size > 1) { queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(subgroup_size)]] { - abstract_classical_spmv( - num_rows, alpha, val, col_idxs, - row_ptrs, b, beta, c, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> 
item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, b, beta, c, + item_ct1); + }); }); } else { queue->submit([&](sycl::handler& cgh) { diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp index aae15245357..a74a47f978e 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp @@ -95,7 +95,7 @@ class Jacobi : public ::testing::Test { if (condition_numbers.size() == 0) { mtx = gko::test::generate_random_matrix( dim, dim, std::uniform_int_distribution<>(min_nnz, max_nnz), - std::normal_distribution(0.0, 1.0), engine, ref); + std::normal_distribution<>(0.0, 1.0), engine, ref); } else { std::vector blocks; for (gko::size_type i = 0; i < block_pointers.size() - 1; ++i) { @@ -103,8 +103,7 @@ class Jacobi : public ::testing::Test { begin(block_pointers)[i + 1] - begin(block_pointers)[i]; const auto cond = begin(condition_numbers)[i]; blocks.push_back(mtx_data::cond( - size, cond, std::normal_distribution(-1, 1), - engine)); + size, cond, std::normal_distribution<>(-1, 1), engine)); } mtx = Mtx::create(ref); mtx->read(mtx_data::diag(begin(blocks), end(blocks))); @@ -140,11 +139,11 @@ class Jacobi : public ::testing::Test { } b = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - std::normal_distribution(0.0, 1.0), engine, ref); + std::normal_distribution<>(0.0, 1.0), engine, ref); d_b = gko::clone(dpcpp, b); x = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - std::normal_distribution(0.0, 1.0), engine, ref); + std::normal_distribution<>(0.0, 1.0), engine, ref); d_x = gko::clone(dpcpp, x); } @@ -442,7 +441,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) smtx->copy_from(dense_smtx); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution(0.0, 1.0), engine, ref)); + std::normal_distribution<>(0.0, 1.0), engine, ref)); auto sx = Vec::create(ref, sb->get_size()); auto d_smtx = gko::share(Mtx::create(dpcpp)); @@ -486,7 +485,7 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) auto dense_data = gko::test::generate_random_matrix_data( dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution(1.0, 2.0), engine); + std::normal_distribution<>(1.0, 2.0), engine); gko::utils::make_diag_dominant(dense_data); auto dense_smtx = gko::share(Vec::create(ref)); dense_smtx->read(dense_data); @@ -494,12 +493,12 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) smtx->copy_from(dense_smtx); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution(0.0, 1.0), engine, ref, - gko::dim<2>(dim, 3), 4)); + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); auto sx = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution(0.0, 1.0), engine, ref, - gko::dim<2>(dim, 3), 4)); + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); auto d_smtx = gko::share(gko::clone(dpcpp, smtx)); auto d_sb = gko::share(gko::clone(dpcpp, sb)); diff --git a/hip/test/matrix/fbcsr_kernels.cpp b/hip/test/matrix/fbcsr_kernels.cpp index c10544394e3..e8c87957c73 100644 --- a/hip/test/matrix/fbcsr_kernels.cpp +++ 
b/hip/test/matrix/fbcsr_kernels.cpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -73,7 +74,7 @@ class Fbcsr : public HipTestFixture { std::unique_ptr rsorted_ref; - std::normal_distribution> distb; + std::normal_distribution<> distb; std::default_random_engine engine; value_type get_random_value() @@ -177,11 +178,15 @@ TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted) this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 1)); auto prod_hip = Dense::create(this->exec, prod_ref->get_size()); - rand_hip->apply(x_hip, prod_hip); - this->rsorted_ref->apply(x_ref, prod_ref); + if (std::is_same::value) { + ASSERT_THROW(rand_hip->apply(x_hip, prod_hip), gko::NotImplemented); + } else { + rand_hip->apply(x_hip, prod_hip); + this->rsorted_ref->apply(x_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } @@ -201,11 +206,15 @@ TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted) this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 3)); auto prod_hip = Dense::create(this->exec, prod_ref->get_size()); - rand_hip->apply(x_hip, prod_hip); - this->rsorted_ref->apply(x_ref, prod_ref); + if (std::is_same::value) { + ASSERT_THROW(rand_hip->apply(x_hip, prod_hip), gko::NotImplemented); + } else { + rand_hip->apply(x_hip, prod_hip); + this->rsorted_ref->apply(x_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } @@ -237,11 +246,16 @@ TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted) auto beta = Dense::create(this->exec); beta->copy_from(beta_ref); - rand_hip->apply(alpha, x_hip, beta, prod_hip); - this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); + if (std::is_same::value) { + ASSERT_THROW(rand_hip->apply(alpha, x_hip, beta, prod_hip), + gko::NotImplemented); + } else { + rand_hip->apply(alpha, x_hip, beta, prod_hip); + this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } @@ -273,11 +287,16 @@ TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted) auto beta = Dense::create(this->exec); beta->copy_from(beta_ref); - rand_hip->apply(alpha, x_hip, beta, prod_hip); - this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); + if (std::is_same::value) { + ASSERT_THROW(rand_hip->apply(alpha, x_hip, beta, prod_hip), + gko::NotImplemented); + } else { + rand_hip->apply(alpha, x_hip, beta, prod_hip); + this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 6b21646c66c..de749d74222 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -522,17 +522,22 @@ class complex { typename = std::enable_if_t::value && std::is_scalar::value>> explicit complex(const T& real, const U& imag) - : complex(static_cast(real), static_cast(imag)) + : real_(static_cast(real)), + imag_(static_cast(imag)) {} template ::value>> - complex(const T& real) : complex(static_cast(real)) + complex(const 
T& real) + : real_(static_cast(real)), + imag_(static_cast(0.f)) {} + // When using complex(real, imag), MSVC with CUDA try to recognize the + // complex is a member not constructor. template ::value>> explicit complex(const complex& other) - : complex(static_cast(other.real()), - static_cast(other.imag())) + : real_(static_cast(other.real())), + imag_(static_cast(other.imag())) {} // explicit complex(const complex& other) = default; diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index 771cdc02af2..5a1e0ab9175 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -77,7 +77,8 @@ make_temporary_conversion(Ptr&& matrix) using Pointee = detail::pointee; using Dense = matrix::Dense; using NextDense = matrix::Dense>; - using NextNextDense = matrix::Dense>>; + using NextNextDense = + matrix::Dense>>; using MaybeConstDense = std::conditional_t::value, const Dense, Dense>; auto result = detail::temporary_conversion< diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 7d4a5a7ebd1..73a2d404de3 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -127,8 +127,8 @@ void advanced_spmv(std::shared_ptr exec, auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - arithmetic_type valpha = alpha->at(0, 0); - arithmetic_type vbeta = beta->at(0, 0); + arithmetic_type valpha = static_cast(alpha->at(0, 0)); + arithmetic_type vbeta = static_cast(beta->at(0, 0)); const auto a_vals = acc::helper::build_const_rrm_accessor(a); diff --git a/omp/matrix/fft_kernels.cpp b/omp/matrix/fft_kernels.cpp index 1ec950282b2..beb15a7176c 100644 --- a/omp/matrix/fft_kernels.cpp +++ b/omp/matrix/fft_kernels.cpp @@ -220,7 +220,8 @@ void fft2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT2_KERNEL); template @@ -325,7 +326,8 @@ void fft3(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/omp/solver/cb_gmres_kernels.cpp b/omp/solver/cb_gmres_kernels.cpp index 1e60e45d734..9b5df4b1782 100644 --- a/omp/solver/cb_gmres_kernels.cpp +++ b/omp/solver/cb_gmres_kernels.cpp @@ -361,7 +361,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF( + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 3a05a09cd45..60c71357abd 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -124,8 +124,8 @@ void advanced_spmv(std::shared_ptr exec, auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - arithmetic_type valpha = alpha->at(0, 0); - arithmetic_type vbeta = beta->at(0, 0); + arithmetic_type valpha = static_cast(alpha->at(0, 0)); + arithmetic_type vbeta = static_cast(beta->at(0, 0)); const auto a_vals = acc::helper::build_const_rrm_accessor(a); diff --git a/reference/matrix/fft_kernels.cpp b/reference/matrix/fft_kernels.cpp index a81a4499c64..8a79f72f5f1 100644 --- a/reference/matrix/fft_kernels.cpp +++ b/reference/matrix/fft_kernels.cpp @@ -213,7 +213,8 @@ void 
fft2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT2_KERNEL); template @@ -313,7 +314,8 @@ void fft3(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/reference/solver/cb_gmres_kernels.cpp b/reference/solver/cb_gmres_kernels.cpp index b5dde273796..b24ee03f11b 100644 --- a/reference/solver/cb_gmres_kernels.cpp +++ b/reference/solver/cb_gmres_kernels.cpp @@ -325,7 +325,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF( + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp index 5cde9f132d3..9c6934dcb27 100644 --- a/reference/test/factorization/lu_kernels.cpp +++ b/reference/test/factorization/lu_kernels.cpp @@ -238,7 +238,7 @@ TYPED_TEST(Lu, KernelFactorizeWorks) diag_idxs.get_const_data(), this->mtx_lu.get(), tmp); GKO_ASSERT_MTX_NEAR(this->mtx_lu, mtx_lu_ref, - 15 * r::value); + 30 * r::value); }); } @@ -284,7 +284,7 @@ TYPED_TEST(Lu, FactorizeNonsymmetricWorks) GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), this->mtx_lu); GKO_ASSERT_MTX_NEAR(lu->get_combined(), this->mtx_lu, - 15 * r::value); + 30 * r::value); ASSERT_EQ(lu->get_storage_type(), gko::experimental::factorization::storage_type::combined_lu); ASSERT_EQ(lu->get_lower_factor(), nullptr); @@ -311,7 +311,7 @@ TYPED_TEST(Lu, FactorizeWithKnownSparsityWorks) auto lu = factory->generate(this->mtx); GKO_ASSERT_MTX_NEAR(lu->get_combined(), this->mtx_lu, - 15 * r::value); + 30 * r::value); ASSERT_EQ(lu->get_storage_type(), gko::experimental::factorization::storage_type::combined_lu); ASSERT_EQ(lu->get_lower_factor(), nullptr); diff --git a/reference/test/preconditioner/isai_kernels.cpp b/reference/test/preconditioner/isai_kernels.cpp index eea171d60fe..b9d3eb9bcff 100644 --- a/reference/test/preconditioner/isai_kernels.cpp +++ b/reference/test/preconditioner/isai_kernels.cpp @@ -1013,6 +1013,8 @@ TYPED_TEST(Isai, ReturnsCorrectInverseA) TYPED_TEST(Isai, ReturnsCorrectInverseALongrow) { using value_type = typename TestFixture::value_type; + // TODO: figure out whether relaxed residual norm works in half or not. 
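The relaxed factors above, such as 30 * r<value_type>::value in the LU tests, follow the usual pattern of scaling a per-type unit roundoff so that one assertion can serve half, float and double. A rough standalone sketch of that pattern; the helper below is hypothetical and not the Ginkgo r<> utility itself:

    #include <cstdio>
    #include <limits>

    // Hypothetical helper: a constant factor times the machine epsilon of the
    // (real) value type under test.
    template <typename T>
    double tolerance(double factor)
    {
        return factor * static_cast<double>(std::numeric_limits<T>::epsilon());
    }

    int main()
    {
        // Machine epsilon is roughly 9.8e-4 for 16-bit floats, 1.2e-7 for
        // float and 2.2e-16 for double, which is why half-precision tests
        // need larger factors or have to be skipped outright.
        std::printf("float:  %.3e\n", tolerance<float>(30.0));
        std::printf("double: %.3e\n", tolerance<double>(30.0));
        return 0;
    }
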
+ SKIP_IF_HALF(value_type); const auto isai = this->general_isai_factory->generate(this->a_csr_longrow); auto a_inv = isai->get_approximate_inverse(); @@ -1029,6 +1031,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseALongrowWithExcessSolver) { using value_type = typename TestFixture::value_type; using GeneralIsai = typename TestFixture::GeneralIsai; + SKIP_IF_HALF(value_type); auto general_isai_factory = GeneralIsai::build() .with_excess_solver_factory(this->excess_solver_factory) @@ -1076,6 +1079,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseLLongrowWithExcessSolver) using Csr = typename TestFixture::Csr; using LowerIsai = typename TestFixture::LowerIsai; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto lower_isai_factory = LowerIsai::build() .with_excess_solver_factory(this->excess_solver_factory) @@ -1123,6 +1127,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseULongrowWithExcessSolver) using Csr = typename TestFixture::Csr; using UpperIsai = typename TestFixture::UpperIsai; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto upper_isai_factory = UpperIsai::build() .with_excess_solver_factory(this->excess_solver_factory) @@ -1223,6 +1228,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseSpdLongrow) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); const auto isai = this->spd_isai_factory->generate(this->spd_csr_longrow); const auto expected_transpose = gko::as(this->spd_csr_longrow_inv->transpose()); @@ -1246,6 +1252,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseSpdLongrowWithExcessSolver) using Csr = typename TestFixture::Csr; using SpdIsai = typename TestFixture::SpdIsai; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); const auto expected_transpose = gko::as(this->spd_csr_longrow_inv->transpose()); auto spd_isai_factory = diff --git a/reference/test/preconditioner/jacobi_kernels.cpp b/reference/test/preconditioner/jacobi_kernels.cpp index 9a2d3411cbb..4f755c35e0e 100644 --- a/reference/test/preconditioner/jacobi_kernels.cpp +++ b/reference/test/preconditioner/jacobi_kernels.cpp @@ -594,11 +594,14 @@ TYPED_TEST(Jacobi, SelectsCorrectBlockPrecisions) auto prec = bj->get_parameters().storage_optimization.block_wise.get_const_data(); - auto precision2 = std::is_same, float>::value - ? gko::precision_reduction(0, 0) // float - : gko::precision_reduction(0, 1); // double - EXPECT_EQ(prec[0], gko::precision_reduction(0, 2)); // u * cond = ~1.2e-3 - ASSERT_EQ(prec[1], precision2); // u * cond = ~2.0e-3 + auto precision1 = std::is_same, gko::half>::value + ? gko::precision_reduction(2, 0) + : gko::precision_reduction(0, 2); + auto precision2 = std::is_same, double>::value + ? gko::precision_reduction(0, 1) // double + : gko::precision_reduction(0, 0); // float, half + EXPECT_EQ(prec[0], precision1); // u * cond = ~1.2e-3 + ASSERT_EQ(prec[1], precision2); // u * cond = ~2.0e-3 } @@ -639,6 +642,9 @@ TYPED_TEST(Jacobi, AvoidsPrecisionsThatOverflow) auto precision = std::is_same, float>::value ? 
gko::precision_reduction(0, 2) // float : gko::precision_reduction(1, 1); // double + if (std::is_same, gko::half>::value) { + precision = gko::precision_reduction(2, 0); + } EXPECT_EQ(prec[0], precision); ASSERT_EQ(prec[1], precision); } diff --git a/reference/test/reorder/scaled_reordered.cpp b/reference/test/reorder/scaled_reordered.cpp index baeb1cf005d..3ebcaaf5506 100644 --- a/reference/test/reorder/scaled_reordered.cpp +++ b/reference/test/reorder/scaled_reordered.cpp @@ -396,6 +396,8 @@ TYPED_TEST(ScaledReordered, AppliesWithRcmReordering) TYPED_TEST(ScaledReordered, SolvesSingleRhsWithOnlyInnerOperator) { using SR = typename TestFixture::SR; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto scaled_reordered_fact = SR::build().with_inner_operator(this->solver_factory).on(this->exec); auto scaled_reordered = scaled_reordered_fact->generate(this->rcm_mtx); @@ -442,6 +444,8 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithColScaling) TYPED_TEST(ScaledReordered, SolvesSingleRhsWithRcmReordering) { using SR = typename TestFixture::SR; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto scaled_reordered_fact = SR::build() .with_reordering(this->rcm_factory) .with_inner_operator(this->solver_factory) @@ -478,6 +482,7 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithScalingAndRcmReorderingMixed) using SR = typename TestFixture::SR; using T = typename TestFixture::value_type; using Vec = gko::matrix::Dense>; + SKIP_IF_HALF(T); auto scaled_reordered_fact = SR::build() .with_row_scaling(this->diag2) .with_col_scaling(this->diag3) @@ -499,6 +504,8 @@ TYPED_TEST(ScaledReordered, AdvancedSolvesSingleRhsWithScalingAndRcmReordering) { using SR = typename TestFixture::SR; using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + SKIP_IF_HALF(T); const auto alpha = gko::initialize({2.0}, this->exec); const auto beta = gko::initialize({-1.0}, this->exec); auto scaled_reordered_fact = SR::build() @@ -523,6 +530,7 @@ TYPED_TEST(ScaledReordered, using T = typename TestFixture::value_type; using value_type = next_precision; using Vec = gko::matrix::Dense; + SKIP_IF_HALF(T); auto scaled_reordered_fact = SR::build() .with_row_scaling(this->diag2) .with_col_scaling(this->diag3) diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp index 56e11dd84bc..9b0ea1f6c80 100644 --- a/reference/test/solver/bicgstab_kernels.cpp +++ b/reference/test/solver/bicgstab_kernels.cpp @@ -528,7 +528,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApply) solver->apply(alpha, b, beta, x); - GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), r::value); + GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), 2 * r::value); } @@ -545,7 +545,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixed) solver->apply(alpha, b, beta, x); GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), - (r_mixed())); + (2 * r_mixed())); } @@ -561,14 +561,14 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyComplex) {value_type{-1.0, 2.0}, value_type{3.0, -6.0}, value_type{1.0, -2.0}}, this->exec); auto x = gko::initialize( - {value_type{0.5, -1.0}, value_type{1.0, -2.0}, value_type{2.0, -4.0}}, + {value_type{0.5, -0.5}, value_type{1.0, 0.5}, value_type{2.0, -1.0}}, this->exec); solver->apply(alpha, b, beta, x); GKO_ASSERT_MTX_NEAR(x, - l({value_type{-8.5, 17.0}, value_type{-3.0, 6.0}, - value_type{6.0, -12.0}}), + l({value_type{-8.5, 16.5}, value_type{-3.0, 3.5}, + value_type{6.0, -15.0}}), 
r::value); } @@ -586,14 +586,14 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixedComplex) {value_type{-1.0, 2.0}, value_type{3.0, -6.0}, value_type{1.0, -2.0}}, this->exec); auto x = gko::initialize( - {value_type{0.5, -1.0}, value_type{1.0, -2.0}, value_type{2.0, -4.0}}, + {value_type{0.5, -0.5}, value_type{1.0, 0.5}, value_type{2.0, -1.0}}, this->exec); solver->apply(alpha, b, beta, x); GKO_ASSERT_MTX_NEAR(x, - l({value_type{-8.5, 17.0}, value_type{-3.0, 6.0}, - value_type{6.0, -12.0}}), + l({value_type{-8.5, 16.5}, value_type{-3.0, 3.5}, + value_type{6.0, -15.0}}), (r_mixed())); } @@ -624,6 +624,7 @@ TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, @@ -652,6 +653,7 @@ TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, @@ -681,6 +683,7 @@ TYPED_TEST(Bicgstab, SolvesMultipleDenseSystemsDivergenceCheck) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using T = value_type; + SKIP_IF_HALF(value_type); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, {-8.0, -66.0, 29.0, -96.0, -95.0, -14.0}, diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp index 6f9c821025d..74bf08d08e8 100644 --- a/reference/test/solver/cgs_kernels.cpp +++ b/reference/test/solver/cgs_kernels.cpp @@ -445,6 +445,8 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyComplex) using Scalar = typename TestFixture::Mtx; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; + // different initial guess leads complex divergent. + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -470,6 +472,8 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixedComplex) gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; + // different initial guess leads complex divergent. 
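Several of the complex-valued solver tests in this area are retuned or skipped for half precision; recall that the complex half operators earlier in this series (include/ginkgo/core/base/half.hpp) now promote through single precision instead of combining the half components directly. A condensed host-only sketch of that round-trip, with a plain float standing in for the 16-bit storage type:

    #include <complex>

    using storage = float;  // placeholder for the half storage type

    struct complex16 {
        storage re{}, im{};

        // Multiply in std::complex<float> and narrow the result back, rather
        // than forming re * rhs.re - im * rhs.im in half precision, which
        // loses accuracy and overflows far earlier.
        complex16& operator*=(const complex16& rhs)
        {
            const auto result = std::complex<float>(re, im) *
                                std::complex<float>(rhs.re, rhs.im);
            re = static_cast<storage>(result.real());
            im = static_cast<storage>(result.imag());
            return *this;
        }
    };
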
+ SKIP_IF_HALF(value_type); auto solver = this->cgs_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -514,6 +518,7 @@ TYPED_TEST(Cgs, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b = gko::initialize( {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); @@ -530,6 +535,7 @@ TYPED_TEST(Cgs, SolvesBigDenseSystemWithImplicitResNormCrit) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, this->exec); @@ -546,6 +552,7 @@ TYPED_TEST(Cgs, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b = gko::initialize( {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, this->exec); @@ -562,6 +569,7 @@ TYPED_TEST(Cgs, SolvesMultipleDenseSystems) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); @@ -628,6 +636,7 @@ TYPED_TEST(Cgs, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big->transpose()); auto b = gko::initialize( {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); @@ -644,6 +653,7 @@ TYPED_TEST(Cgs, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big->conj_transpose()); auto b = gko::initialize( diff --git a/reference/test/solver/gcr_kernels.cpp b/reference/test/solver/gcr_kernels.cpp index 888cbc3b4fe..1afe813c12b 100644 --- a/reference/test/solver/gcr_kernels.cpp +++ b/reference/test/solver/gcr_kernels.cpp @@ -274,7 +274,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemMixed) solver->apply(b.get(), x.get()); GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), - (r_mixed())); + (r_mixed() * 1e1)); } @@ -319,7 +319,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemMixedComplex) GKO_ASSERT_MTX_NEAR(x, l({value_type{1.0, -2.0}, value_type{3.0, -6.0}, value_type{2.0, -4.0}}), - (r_mixed())); + (r_mixed() * 1e1)); } @@ -370,7 +370,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApplyMixed) solver->apply(alpha.get(), b.get(), beta.get(), x.get()); GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), - (r_mixed()) * 1e1); + (r_mixed()) * 1e2); } @@ -449,6 +449,7 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -466,6 +467,7 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto 
solver = this->gcr_factory_big->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, @@ -511,6 +513,7 @@ TYPED_TEST(Gcr, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -577,6 +580,7 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem1WithRestart) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); auto gcr_factory_restart = Solver::build() @@ -605,6 +609,7 @@ TYPED_TEST(Gcr, SolvesWithPreconditioner) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto gcr_factory_preconditioner = Solver::build() .with_criteria( @@ -635,6 +640,7 @@ TYPED_TEST(Gcr, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big->transpose()); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -652,6 +658,7 @@ TYPED_TEST(Gcr, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big->conj_transpose()); auto b = gko::initialize( diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index 002245c523c..e34ffc95e23 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -724,6 +724,7 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1WithRestart) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); auto gmres_factory_restart = Solver::build() diff --git a/reference/test/solver/idr_kernels.cpp b/reference/test/solver/idr_kernels.cpp index a1154ccb598..7ad933142e9 100644 --- a/reference/test/solver/idr_kernels.cpp +++ b/reference/test/solver/idr_kernels.cpp @@ -95,7 +95,10 @@ class Idr : public ::testing::Test { std::unique_ptr idr_factory_precision; }; -TYPED_TEST_SUITE(Idr, gko::test::ValueTypes, TypenameNameGenerator); +// Solves((Conj)Trans)DenseSystem((Mixed)Complex) does not work in some default +// random generator from different environments. All tests will SKIP half, so we +// do not test half here. 
+TYPED_TEST_SUITE(Idr, gko::test::ValueTypesNoHalf, TypenameNameGenerator); TYPED_TEST(Idr, SolvesDenseSystem) @@ -114,7 +117,8 @@ TYPED_TEST(Idr, SolvesDenseSystem) TYPED_TEST(Idr, SolvesDenseSystemMixed) { - using value_type = next_precision; + using T = typename TestFixture::value_type; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->idr_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -129,6 +133,7 @@ TYPED_TEST(Idr, SolvesDenseSystemMixed) TYPED_TEST(Idr, SolvesDenseSystemComplex) { + using T = typename TestFixture::value_type; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->idr_factory->generate(this->mtx); @@ -150,8 +155,8 @@ TYPED_TEST(Idr, SolvesDenseSystemComplex) TYPED_TEST(Idr, SolvesDenseSystemMixedComplex) { - using value_type = - gko::to_complex>; + using T = typename TestFixture::value_type; + using value_type = gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->idr_factory->generate(this->mtx); auto b = gko::initialize( @@ -175,6 +180,7 @@ TYPED_TEST(Idr, SolvesDenseSystemWithComplexSubSpace) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using Solver = typename TestFixture::Solver; + // intermediate value is too small to represent in half auto half_tol = std::sqrt(r::value); auto solver_factory = Solver::build() @@ -271,6 +277,7 @@ TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyMixed) { using value_type = next_precision; using Mtx = gko::matrix::Dense; + SKIP_IF_HALF(typename TestFixture::value_type); auto solver = this->idr_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -338,6 +345,7 @@ TYPED_TEST(Idr, SolvesMultipleDenseSystemsUsingAdvancedApply) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using T = value_type; + SKIP_IF_HALF(T); auto half_tol = std::sqrt(r::value); auto solver = this->idr_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp index 0d211896f06..67105c8e2e2 100644 --- a/test/base/device_matrix_data_kernels.cpp +++ b/test/base/device_matrix_data_kernels.cpp @@ -67,8 +67,7 @@ class DeviceMatrixData : public CommonTestFixture { 0, host_data.size[0] - 1); std::uniform_int_distribution col_distr( 0, host_data.size[1] - 1); - std::uniform_real_distribution<> - val_distr(1.0, 2.0); + std::uniform_real_distribution<> val_distr(1.0, 2.0); // add random entries for (int i = 0; i < 1000; i++) { host_data.nonzeros.emplace_back( diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp index 25c7c6450af..62751309f08 100644 --- a/test/factorization/par_ic_kernels.cpp +++ b/test/factorization/par_ic_kernels.cpp @@ -73,8 +73,7 @@ class ParIc : public CommonTestFixture { mtx_l = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution<>(0, 10.0), - rand_engine, ref); + std::normal_distribution<>(0, 10.0), rand_engine, ref); dmtx_ani = Csr::create(exec); dmtx_l_ani = Csr::create(exec); dmtx_l_ani_init = Csr::create(exec); @@ -139,6 +138,8 @@ TYPED_TEST(ParIc, KernelComputeFactorIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; + using value_type = typename 
TestFixture::value_type; + SKIP_IF_HALF(value_type); auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); this->mtx_l_ani->convert_to(mtx_l_coo); diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp index 514abf1bbe3..c379324172f 100644 --- a/test/factorization/par_ict_kernels.cpp +++ b/test/factorization/par_ict_kernels.cpp @@ -155,6 +155,8 @@ TYPED_TEST(ParIct, KernelComputeFactorIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); this->mtx_l_ani->convert_to(mtx_l_coo); diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp index 480f62057b8..22f4b740fd7 100644 --- a/test/factorization/par_ilu_kernels.cpp +++ b/test/factorization/par_ilu_kernels.cpp @@ -90,8 +90,7 @@ class ParIlu : public CommonTestFixture { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution(0, num_cols - 1), - std::normal_distribution<>(0.0, 1.0), - rand_engine, ref); + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); } std::unique_ptr gen_unsorted_mtx(index_type num_rows, @@ -277,6 +276,8 @@ TYPED_TEST(ParIlu, KernelInitializeParILUIsEquivalentToRef) TYPED_TEST(ParIlu, KernelComputeParILUIsEquivalentToRef) { using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); std::unique_ptr l_mtx{}; std::unique_ptr u_mtx{}; std::unique_ptr dl_mtx{}; @@ -295,6 +296,7 @@ TYPED_TEST(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); std::unique_ptr l_mtx{}; std::unique_ptr u_mtx{}; std::unique_ptr dl_mtx{}; diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp index b664b4c4108..2bd53c19717 100644 --- a/test/factorization/par_ilut_kernels.cpp +++ b/test/factorization/par_ilut_kernels.cpp @@ -396,6 +396,8 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; + // there's one value larger than half range + SKIP_IF_HALF(value_type); auto square_size = this->mtx_square->get_size(); auto mtx_lu = Csr::create(this->ref, square_size); this->mtx_l2->apply(this->mtx_u, mtx_lu); @@ -424,6 +426,8 @@ TYPED_TEST(ParIlut, KernelComputeLUIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); auto mtx_u_coo = Coo::create(this->ref, square_size); diff --git a/test/matrix/fft_kernels.cpp b/test/matrix/fft_kernels.cpp index b598bb69bce..1d59d687a66 100644 --- a/test/matrix/fft_kernels.cpp +++ b/test/matrix/fft_kernels.cpp @@ -122,7 +122,8 @@ class Fft : public CommonTestFixture { }; -TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypesNoHalf, TypenameNameGenerator); +TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypesNoHalf, + TypenameNameGenerator); TYPED_TEST(Fft, Apply1DIsEqualToReference) diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index adfe63e5770..0c1fc2c1a36 100644 --- 
a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -438,7 +438,7 @@ TYPED_TEST(Matrix, CanConvertToNextPrecision) using csr = typename TestFixture::local_matrix_type; using local_index_type = typename TestFixture::local_index_type; using global_index_type = typename TestFixture::global_index_type; - using OtherT = next_precision; + using OtherT = next_precision; using OtherDist = typename gko::experimental::distributed::Matrix< OtherT, local_index_type, global_index_type>; auto tmp = OtherDist::create(this->ref, this->comm); @@ -464,7 +464,7 @@ TYPED_TEST(Matrix, CanMoveToNextPrecision) using csr = typename TestFixture::local_matrix_type; using local_index_type = typename TestFixture::local_index_type; using global_index_type = typename TestFixture::global_index_type; - using OtherT = next_precision; + using OtherT = next_precision; using OtherDist = typename gko::experimental::distributed::Matrix< OtherT, local_index_type, global_index_type>; auto tmp = OtherDist::create(this->ref, this->comm); diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index fe3b81406c5..3c568d8208c 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -545,7 +545,7 @@ class VectorReductions : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(VectorReductions, gko::test::ValueTypes, +TYPED_TEST_SUITE(VectorReductions, gko::test::ValueTypesNoHalf, TypenameNameGenerator); @@ -770,8 +770,7 @@ class VectorLocalOps : public CommonMpiTestFixture { local_size[0], local_size[1], std::uniform_int_distribution(local_size[1], local_size[1]), - std::normal_distribution>(), engine, - exec); + std::normal_distribution<>(), engine, exec); dist = DistVectorType::create(exec, comm, size, gko::clone(local)); } @@ -783,8 +782,7 @@ class VectorLocalOps : public CommonMpiTestFixture { alpha = gko::test::generate_random_matrix( 1, size[1], std::uniform_int_distribution(size[1], size[1]), - std::normal_distribution>(), engine, - exec); + std::normal_distribution<>(), engine, exec); } void init_complex_vectors() @@ -847,7 +845,7 @@ TYPED_TEST(VectorLocalOps, AdvancedApplyNotSupported) TYPED_TEST(VectorLocalOps, ConvertsToPrecision) { using T = typename TestFixture::value_type; - using OtherT = next_precision; + using OtherT = next_precision; using OtherVector = typename gko::experimental::distributed::Vector; auto local_tmp = OtherVector::local_vector_type::create(this->exec); auto tmp = OtherVector::create(this->exec, this->comm); @@ -863,7 +861,7 @@ TYPED_TEST(VectorLocalOps, ConvertsToPrecision) TYPED_TEST(VectorLocalOps, MovesToPrecision) { using T = typename TestFixture::value_type; - using OtherT = next_precision; + using OtherT = next_precision; using OtherVector = typename gko::experimental::distributed::Vector; auto local_tmp = OtherVector::local_vector_type::create(this->exec); auto tmp = OtherVector::create(this->exec, this->comm); @@ -978,8 +976,7 @@ TYPED_TEST(VectorLocalOps, FillSameAsLocal) { using value_type = typename TestFixture::value_type; auto value = gko::test::detail::get_rand_value( - std::normal_distribution>(), - this->engine); + std::normal_distribution<>(), this->engine); this->init_vectors(); this->x->fill(value); From 2d7f4cbc7fa4c9d4b1e3ce461319d1e221fa4724 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Wed, 21 Jun 2023 22:28:16 +0200 Subject: [PATCH 28/48] mkl csr does not support half --- dpcpp/matrix/csr_kernels.dp.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 11d583fc284..863cd7e2520 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -1247,8 +1247,9 @@ bool try_general_sparselib_spmv(std::shared_ptr exec, const ValueType host_beta, matrix::Dense* c) { - bool try_sparselib = !is_complex(); - if (try_sparselib) { + constexpr bool try_sparselib = + !is_complex() && !std::is_same::value; + if constexpr (try_sparselib) { oneapi::mkl::sparse::matrix_handle_t mat_handle; oneapi::mkl::sparse::init_matrix_handle(&mat_handle); oneapi::mkl::sparse::set_csr_data( From b04c993fba1903964b89988a9e8f31a2e81507f6 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 7 Sep 2023 10:49:59 +0200 Subject: [PATCH 29/48] add half to batch_vector --- core/base/batch_multi_vector.cpp | 19 ++++++++++++++++ .../ginkgo/core/base/batch_multi_vector.hpp | 22 +++++++++++++++++-- .../test/base/batch_multi_vector_kernels.cpp | 4 ++-- 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 23591cd1ffe..3774b6aad58 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -290,6 +290,25 @@ void MultiVector::move_to( } +#if GINKGO_ENABLE_HALF +template +void MultiVector::convert_to( + MultiVector>>* result) const +{ + result->values_ = this->values_; + result->set_size(this->get_size()); +} + + +template +void MultiVector::move_to( + MultiVector>>* result) +{ + this->convert_to(result); +} +#endif + + #define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class MultiVector<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR); diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index d91274526d3..dd1ee930ca7 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -81,11 +81,15 @@ class MultiVector : public EnablePolymorphicObject>, public EnablePolymorphicAssignment>, public EnableCreateMethod>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + MultiVector>>>, +#endif public ConvertibleTo>> { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class MultiVector>; - friend class MultiVector>; + friend class MultiVector>; public: using EnablePolymorphicAssignment::convert_to; @@ -113,6 +117,20 @@ class MultiVector void move_to(MultiVector>* result) override; +#if GINKGO_ENABLE_HALF + friend class MultiVector>>; + using ConvertibleTo< + MultiVector>>>::convert_to; + using ConvertibleTo< + MultiVector>>>::move_to; + + void convert_to(MultiVector>>* + result) const override; + + void move_to(MultiVector>>* result) + override; +#endif + /** * Creates a mutable view (of matrix::Dense type) of one item of the Batch * MultiVector object. 
Does not perform any deep copies, but only returns a @@ -430,7 +448,7 @@ class MultiVector private: batch_dim<2> batch_size_; array values_; -}; +}; // namespace batch /** diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp index 4f922c37703..21136a172ae 100644 --- a/reference/test/base/batch_multi_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -354,7 +354,7 @@ TYPED_TEST(MultiVector, ConvertsToPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : static_cast>(r::value); this->mtx_1->convert_to(tmp.get()); tmp->convert_to(res.get()); @@ -377,7 +377,7 @@ TYPED_TEST(MultiVector, MovesToPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : static_cast>(r::value); this->mtx_1->move_to(tmp.get()); tmp->move_to(res.get()); From 10bb4ae882449193eb7d800aa4d254fc5120568a Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 12 Sep 2023 17:02:42 +0200 Subject: [PATCH 30/48] fix hip thrust complex op, avoid const in nvhpc, reduce job in windows --- .github/workflows/windows-mingw.yml | 2 +- .github/workflows/windows-msvc-ref.yml | 7 ++++--- accessor/reference_helper.hpp | 6 ++++-- hip/base/types.hip.hpp | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/windows-mingw.yml b/.github/workflows/windows-mingw.yml index fc1906c982d..d70b94b8b28 100644 --- a/.github/workflows/windows-mingw.yml +++ b/.github/workflows/windows-mingw.yml @@ -50,7 +50,7 @@ jobs: mkdir build cd build cmake -G "MinGW Makefiles" -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_COMPILER_FLAGS=${{ matrix.config.cflags }} .. - cmake --build . -j4 + cmake --build . -j2 shell: cmd - name: install diff --git a/.github/workflows/windows-msvc-ref.yml b/.github/workflows/windows-msvc-ref.yml index f7d73e2fd82..a5be64c9daa 100644 --- a/.github/workflows/windows-msvc-ref.yml +++ b/.github/workflows/windows-msvc-ref.yml @@ -27,8 +27,9 @@ jobs: fail-fast: false matrix: config: - - {shared: "ON", build_type: "Debug", name: "reference/debug/shared"} - - {shared: "OFF", build_type: "Release", name: "reference/release/static"} + # Debug with half precision has the issue "library limit of 65535 objects exceeded" + - {shared: "ON", build_type: "Debug", name: "reference/debug/shared", half: "OFF"} + - {shared: "OFF", build_type: "Release", name: "reference/release/static", half: "ON"} # Debug static needs too much storage # - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"} name: msvc/${{ matrix.config.name }} @@ -47,7 +48,7 @@ jobs: run: | mkdir build cd build - cmake -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG='/MDd /Zi /Ob1 /O1 /Od /RTC1' -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF .. + cmake -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG='/MDd /Zi /Ob1 /O1 /Od /RTC1' -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_ENABLE_HALF=${{ matrix.config.half }}.. cmake --build . -j4 --config ${{ matrix.config.build_type }} ctest . 
-C ${{ matrix.config.build_type }} --output-on-failure diff --git a/accessor/reference_helper.hpp b/accessor/reference_helper.hpp index 40dc4bebaf2..18d55712130 100644 --- a/accessor/reference_helper.hpp +++ b/accessor/reference_helper.hpp @@ -43,8 +43,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // CUDA TOOLKIT < 11 does not support constexpr in combination with // thrust::complex, which is why constexpr is only present in later versions -#if defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && \ - (__CUDACC_VER_MAJOR__ < 11) +// TODO: NVC++ constexpr +#if (defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && \ + (__CUDACC_VER_MAJOR__ < 11)) || \ + (defined(__NVCOMPILER) && GINKGO_ENABLE_HALF) #define GKO_ACC_ENABLE_REFERENCE_CONSTEXPR diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 56fe09f3017..aa792e4edb7 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -106,7 +106,7 @@ GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ { \ - return thrust::complex{lhs} + thrust::complex(rhs); \ + return thrust::complex{lhs} _op thrust::complex(rhs); \ } THRUST_HALF_FRIEND_OPERATOR(+, +=) From bf25352a5ee35968aad62d84718499a72c9038b5 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 13 Sep 2023 16:21:12 +0200 Subject: [PATCH 31/48] fix nvc++ atomic, dpcpp half jacobi need to use value_type generator --- .../test/preconditioner/jacobi_kernels.dp.cpp | 23 ++++++++++--------- include/ginkgo/core/base/half.hpp | 16 ------------- include/ginkgo/core/base/math.hpp | 7 ------ include/ginkgo/core/base/types.hpp | 15 +++++------- omp/components/atomic.hpp | 9 ++++++++ 5 files changed, 27 insertions(+), 43 deletions(-) diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp index a74a47f978e..f869a1b05ed 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp @@ -95,7 +95,7 @@ class Jacobi : public ::testing::Test { if (condition_numbers.size() == 0) { mtx = gko::test::generate_random_matrix( dim, dim, std::uniform_int_distribution<>(min_nnz, max_nnz), - std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), engine, ref); } else { std::vector blocks; for (gko::size_type i = 0; i < block_pointers.size() - 1; ++i) { @@ -103,7 +103,8 @@ class Jacobi : public ::testing::Test { begin(block_pointers)[i + 1] - begin(block_pointers)[i]; const auto cond = begin(condition_numbers)[i]; blocks.push_back(mtx_data::cond( - size, cond, std::normal_distribution<>(-1, 1), engine)); + size, cond, std::normal_distribution(-1, 1), + engine)); } mtx = Mtx::create(ref); mtx->read(mtx_data::diag(begin(blocks), end(blocks))); @@ -139,11 +140,11 @@ class Jacobi : public ::testing::Test { } b = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), engine, ref); d_b = gko::clone(dpcpp, b); x = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), engine, ref); d_x = gko::clone(dpcpp, x); } @@ -433,7 +434,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) auto dense_data = 
gko::test::generate_random_matrix_data( dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution<>(1.0, 2.0), engine); + std::normal_distribution(1.0, 2.0), engine); gko::utils::make_diag_dominant(dense_data); auto dense_smtx = gko::share(Vec::create(ref)); dense_smtx->read(dense_data); @@ -441,7 +442,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) smtx->copy_from(dense_smtx); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref)); + std::normal_distribution(0.0, 1.0), engine, ref)); auto sx = Vec::create(ref, sb->get_size()); auto d_smtx = gko::share(Mtx::create(dpcpp)); @@ -485,7 +486,7 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) auto dense_data = gko::test::generate_random_matrix_data( dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution<>(1.0, 2.0), engine); + std::normal_distribution(1.0, 2.0), engine); gko::utils::make_diag_dominant(dense_data); auto dense_smtx = gko::share(Vec::create(ref)); dense_smtx->read(dense_data); @@ -493,12 +494,12 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) smtx->copy_from(dense_smtx); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), - 4)); + std::normal_distribution(0.0, 1.0), engine, ref, + gko::dim<2>(dim, 3), 4)); auto sx = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), - 4)); + std::normal_distribution(0.0, 1.0), engine, ref, + gko::dim<2>(dim, 3), 4)); auto d_smtx = gko::share(gko::clone(dpcpp, smtx)); auto d_sb = gko::share(gko::clone(dpcpp, sb)); diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index de749d74222..218a487e1a4 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -42,9 +42,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#ifdef SYCL_LANGUAGE_VERSION -#include -#endif #ifdef __CUDA_ARCH__ @@ -322,14 +319,6 @@ struct precision_converter { } // namespace detail -// sycl::half miss the arithmetic operator to result float not half before 5.7 -// (2022-06). It leads ? half : half/half ambiguous The same issue is reported -// in https://github.com/intel/llvm/issues/6028 -#if defined(SYCL_LANGUAGE_VERSION) && \ - (__LIBSYCL_MAJOR_VERSION > 5 || \ - (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) -using half = sycl::half; -#else /** * A class providing basic support for half precision floating point types. 
* @@ -500,7 +489,6 @@ class half { uint16 data_; }; -#endif } // namespace gko @@ -662,9 +650,6 @@ class complex { value_type imag_; }; -#if !(defined(SYCL_LANGUAGE_VERSION) && \ - (__LIBSYCL_MAJOR_VERSION > 5 || \ - (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7))) template <> struct numeric_limits { static constexpr bool is_specialized{true}; @@ -700,7 +685,6 @@ struct numeric_limits { } }; -#endif // complex using a template on operator= for any kind of complex, so we can // do full specialization for half diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 4aa4a70de0e..193fb473b1f 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -48,13 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -// Using SYCL_LANGUAGE_VERSION will lead the mismatch sycl namespace from 6.0.0 -// when using dpcpp compiler without dpcpp module -#if GINKGO_DPCPP_MAJOR_VERSION -#include -#endif - - class __half; diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 1765795f256..bb345207199 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -53,9 +53,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif // __HIPCC__ -#ifdef SYCL_LANGUAGE_VERSION -#include -#endif // Macros for handling different compilers / architectures uniformly #if defined(__CUDACC__) || defined(__HIPCC__) @@ -162,13 +159,13 @@ using uint64 = std::uint64_t; */ using uintptr = std::uintptr_t; -#if defined(SYCL_LANGUAGE_VERSION) && \ - (__LIBSYCL_MAJOR_VERSION > 5 || \ - (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) -using half = sycl::half; -#else +// #if defined(SYCL_LANGUAGE_VERSION) && \ +// (__LIBSYCL_MAJOR_VERSION > 5 || \ +// (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) +// using half = sycl::half; +// #else class half; -#endif +// #endif /** diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp index f2a0d9d5d86..8d5e1749974 100644 --- a/omp/components/atomic.hpp +++ b/omp/components/atomic.hpp @@ -80,6 +80,14 @@ inline ResultType reinterpret(ValueType val) template <> void atomic_add(half& out, half val) { +#ifdef __NVCOMPILER +// NVC++ uses atomic capture on uint16 leads the following error. +// use of undefined value '%L.B*' br label %L.B* !llvm.loop !*, !dbg !* +#pragma omp critical + { + out += val; + } +#else // UB? uint16_t* address_as_converter = reinterpret_cast(&out); uint16_t old = *address_as_converter; @@ -93,6 +101,7 @@ void atomic_add(half& out, half val) *address_as_converter = (old == assumed) ? answer : old; } } while (assumed != old); +#endif } From cf9c2181b90ca06142171a5f6847abbfaf60f3a8 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 14 Sep 2023 15:04:20 +0200 Subject: [PATCH 32/48] make half test optional --- core/test/utils.hpp | 108 ++++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 48 deletions(-) diff --git a/core/test/utils.hpp b/core/test/utils.hpp index 16e418d433e..90ef3d20827 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -62,14 +62,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { namespace test { +#if GINKGO_ENABLE_HALF +#define OPTIONAL(...) __VA_ARGS__, +#else +#define OPTIONAL(...) 
+#endif using ValueTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types, - std::complex>; + ::testing::Types) std::complex>; #else - ::testing::Types, - std::complex, std::complex>; + ::testing::Types) std::complex, + std::complex>; #endif using ValueTypesNoHalf = @@ -81,9 +87,9 @@ using ValueTypesNoHalf = using ComplexValueTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types, std::complex>; + ::testing::Types) std::complex>; #else - ::testing::Types, std::complex, + ::testing::Types) std::complex, std::complex>; #endif @@ -96,9 +102,9 @@ using ComplexValueTypesNoHalf = using RealValueTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types; + ::testing::Types; #else - ::testing::Types; + ::testing::Types; #endif @@ -112,81 +118,87 @@ using LocalGlobalIndexTypes = using PODTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types; + ::testing::Types; #else - ::testing::Types; + ::testing::Types; #endif using ValueAndIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types, - std::complex, gko::int32, gko::int64, - gko::size_type>; + ::testing::Types) std::complex, + gko::int32, gko::int64, gko::size_type>; #else - ::testing::Types, - std::complex, std::complex, gko::int32, - gko::int64, gko::size_type>; + ::testing::Types) std::complex, + std::complex, gko::int32, gko::int64, + gko::size_type>; #endif using RealValueAndIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types; + ::testing::Types; #else - ::testing::Types; #endif using ValueIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types, - std::tuple, - std::tuple, gko::int32>, - std::tuple, gko::int32>, + ::testing::Types) + std::tuple, + OPTIONAL(std::tuple, gko::int32>) + std::tuple, gko::int32>, std::tuple, std::tuple, gko::int64>>; #else - ::testing::Types< - std::tuple, std::tuple, - std::tuple, - std::tuple, gko::int32>, - std::tuple, gko::int32>, - std::tuple, gko::int32>, - std::tuple, std::tuple, - std::tuple, - std::tuple, gko::int64>, - std::tuple, gko::int64>, - std::tuple, gko::int64>>; + ::testing::Types) + std::tuple, + std::tuple, + OPTIONAL(std::tuple, gko::int32>) + std::tuple, gko::int32>, + std::tuple, gko::int32>, + OPTIONAL(std::tuple) + std::tuple, + std::tuple, + OPTIONAL(std::tuple, gko::int64>) + std::tuple, gko::int64>, + std::tuple, gko::int64>>; #endif using RealValueIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types< - std::tuple, std::tuple, - std::tuple, std::tuple>; + ::testing::Types) + std::tuple, + OPTIONAL(std::tuple) + std::tuple>; #else - ::testing::Types< - std::tuple, std::tuple, - std::tuple, std::tuple, - std::tuple, std::tuple>; + ::testing::Types) + std::tuple, + std::tuple, + OPTIONAL(std::tuple) + std::tuple, + std::tuple>; #endif using ComplexValueIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types, gko::int32>, - std::tuple, gko::int32>, - std::tuple, gko::int64>, - std::tuple, gko::int64>>; + ::testing::Types) gko::int32>, + std::tuple, gko::int32>, + OPTIONAL(std::tuple, gko::int64>) + std::tuple, gko::int64>> ; #else - ::testing::Types, gko::int32>, - std::tuple, gko::int32>, + ::testing::Types, gko::int32>) + std::tuple, gko::int32>, std::tuple, gko::int32>, - std::tuple, gko::int64>, - std::tuple, gko::int64>, + OPTIONAL(std::tuple, gko::int64>) + std::tuple, gko::int64>, std::tuple, gko::int64>>; #endif From 1e4b68b3470cf194c50a0f6f3b6de90773d7b86e Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Sat, 16 Sep 2023 00:58:00 +0200 Subject: [PATCH 33/48] nvhpc optimization/computation error workaround --- core/test/utils/matrix_generator.hpp | 8 ++++++-- include/ginkgo/core/base/half.hpp | 6 ++++-- omp/factorization/par_ilut_kernels.cpp | 7 ++++++- reference/factorization/par_ilut_kernels.cpp | 7 ++++++- reference/test/stop/residual_norm_kernels.cpp | 4 +++- 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp index 6928c5424a5..1194f86ef77 100644 --- a/core/test/utils/matrix_generator.hpp +++ b/core/test/utils/matrix_generator.hpp @@ -596,9 +596,13 @@ gko::matrix_data generate_tridiag_inverse_matrix_data( auto off_diag = i < j ? upper : lower; auto min_idx = std::min(i, j); auto max_idx = std::max(i, j); + // TODO: NVHPC requires explicitly casting to single precision + // from half. auto val = sign * - static_cast( - std::pow(off_diag, max_idx - min_idx)) * + static_cast(std::pow( + typename gko::detail::arth_type::type{ + off_diag}, + max_idx - min_idx)) * alpha[min_idx] * beta[max_idx + 1] / alpha.back(); md.nonzeros.emplace_back(i, j, val); } diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 218a487e1a4..8df7b14fec9 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -38,7 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include @@ -327,7 +326,10 @@ struct precision_converter { */ class half { public: - GKO_ATTRIBUTES half() noexcept = default; + // TODO: NVHPC (host side) may not use zero initialzation for the data + // member by default constructor in some cases. Not sure whether it is + // caused by something else in jacobi or isai. + GKO_ATTRIBUTES half() noexcept : data_(0){}; template ::value>> GKO_ATTRIBUTES half(const T val) diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp index b2c443635e8..90f0a243e19 100644 --- a/omp/factorization/par_ilut_kernels.cpp +++ b/omp/factorization/par_ilut_kernels.cpp @@ -213,7 +213,12 @@ void threshold_filter_approx(std::shared_ptr exec, // pick splitters for (IndexType i = 0; i < bucket_count - 1; ++i) { // shift by one so we get upper bounds for the buckets - sample[i] = sample[(i + 1) * sampleselect_oversampling]; + // TODO FIXME: NVHPC 23.3 seems to handle assignment index with + // optimization wrongly on a custom class when IndexType is long. We set + // the index explicitly with volatile to solve it. + // https://godbolt.org/z/srYhGndKn + volatile auto index = (i + 1) * sampleselect_oversampling; + sample[i] = sample[index]; } // count elements per bucket auto total_histogram = reinterpret_cast(sample + bucket_count); diff --git a/reference/factorization/par_ilut_kernels.cpp b/reference/factorization/par_ilut_kernels.cpp index f26da021681..83ada9201ea 100644 --- a/reference/factorization/par_ilut_kernels.cpp +++ b/reference/factorization/par_ilut_kernels.cpp @@ -222,7 +222,12 @@ void threshold_filter_approx(std::shared_ptr exec, // pick splitters for (IndexType i = 0; i < bucket_count - 1; ++i) { // shift by one so we get upper bounds for the buckets - sample[i] = sample[(i + 1) * sampleselect_oversampling]; + // TODO FIXME: NVHPC 23.3 seems to handle assignment index with + // optimization wrongly on a custom class when IndexType is long. We set + // the index explicitly with volatile to solve it. 
+ // https://godbolt.org/z/srYhGndKn + volatile auto index = (i + 1) * sampleselect_oversampling; + sample[i] = sample[index]; } // count elements per bucket auto histogram = reinterpret_cast(sample + bucket_count); diff --git a/reference/test/stop/residual_norm_kernels.cpp b/reference/test/stop/residual_norm_kernels.cpp index 1c18fbb895d..498fea61cb8 100644 --- a/reference/test/stop/residual_norm_kernels.cpp +++ b/reference/test/stop/residual_norm_kernels.cpp @@ -398,7 +398,9 @@ TYPED_TEST(ResidualNorm, SelfCalculatesAndWaitsTillResidualGoal) ASSERT_FALSE(abs_criterion->update().solution(solution).check( RelativeStoppingId, true, &stop_status, &one_changed)); - solution->at(0) = rhs_val - r::value * T{1.2}; + // TODO FIXME: NVHPC calculates different result of rhs - r*1.2 from + // rhs - tmp = rhs - (r * 1.2). https://godbolt.org/z/GrGE9PE67 + solution->at(0) = rhs_val - r::value * T{1.4}; ASSERT_FALSE(abs_criterion->update().solution(solution).check( RelativeStoppingId, true, &stop_status, &one_changed)); ASSERT_EQ(stop_status.get_data()[0].has_converged(), false); From c6bbf20042863b4d87f439fc42229a8e7b6e086e Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Sat, 16 Sep 2023 00:58:14 +0200 Subject: [PATCH 34/48] disable mpi half test --- core/test/utils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/test/utils.hpp b/core/test/utils.hpp index 90ef3d20827..098b6355cee 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -118,9 +118,9 @@ using LocalGlobalIndexTypes = using PODTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types; + ::testing::Types; #else - ::testing::Types; + ::testing::Types; #endif From 18d323ecf10b409d027963069617744f70ed655e Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Fri, 29 Sep 2023 15:57:48 +0200 Subject: [PATCH 35/48] some math func is not defined if nvhpc is for host --- cuda/base/types.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 0986ff38f59..fe4bc40c2de 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -94,6 +94,9 @@ THRUST_HALF_FRIEND_OPERATOR(/, /=) namespace gko { +// It is required by NVHPC 23.3, isnan is undefined when NVHPC are only as host +// compiler. +#ifdef __CUDACC__ // from the cuda_fp16.hpp #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 @@ -126,10 +129,14 @@ __device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val) } +#endif + + namespace kernels { namespace cuda { +#ifdef __CUDACC__ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if CUDA_VERSION >= 10020 @@ -161,7 +168,7 @@ __device__ __forceinline__ __half sqrt(const __half& val) #endif - +#endif namespace detail { From 8b27e3cac4062d3fabf82d07c636b979ff29679b Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Sat, 30 Sep 2023 20:33:02 +0200 Subject: [PATCH 36/48] add half spmv benchmark (with cusparse for cuda) --- benchmark/CMakeLists.txt | 33 ++++++++++++++++++--------- benchmark/run_all_benchmarks.sh | 15 ++++++++++--- benchmark/spmv/CMakeLists.txt | 5 +++++ benchmark/spmv/spmv_common.hpp | 4 +++- benchmark/utils/cuda_linops.cpp | 40 ++++++++++++++++++++------------- benchmark/utils/generator.hpp | 5 +---- benchmark/utils/types.hpp | 7 +++++- 7 files changed, 74 insertions(+), 35 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 50c24955b47..61ff7aeb557 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -76,17 +76,25 @@ function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def ty target_compile_options("${name}" PRIVATE ${GINKGO_COMPILER_FLAGS}) ginkgo_benchmark_add_tuning_maybe("${name}") if("${use_lib_linops}") - if (GINKGO_BUILD_CUDA) - target_compile_definitions("${name}" PRIVATE HAS_CUDA=1) - target_link_libraries("${name}" cusparse_linops_${type}) - endif() - if (GINKGO_BUILD_HIP) - target_compile_definitions("${name}" PRIVATE HAS_HIP=1) - target_link_libraries("${name}" hipsparse_linops_${type}) - endif() - if (GINKGO_BUILD_DPCPP) - target_compile_definitions("${name}" PRIVATE HAS_DPCPP=1) - target_link_libraries("${name}" onemkl_linops_${type}) + if ("${type}" STREQUAL "h") + # only cuda supports half currently + if (GINKGO_BUILD_CUDA) + target_compile_definitions("${name}" PRIVATE HAS_CUDA=1) + target_link_libraries("${name}" cusparse_linops_${type}) + endif() + else() + if (GINKGO_BUILD_CUDA) + target_compile_definitions("${name}" PRIVATE HAS_CUDA=1) + target_link_libraries("${name}" cusparse_linops_${type}) + endif() + if (GINKGO_BUILD_HIP) + target_compile_definitions("${name}" PRIVATE HAS_HIP=1) + target_link_libraries("${name}" hipsparse_linops_${type}) + endif() + if (GINKGO_BUILD_DPCPP) + target_compile_definitions("${name}" PRIVATE HAS_DPCPP=1) + target_link_libraries("${name}" onemkl_linops_${type}) + endif() endif() endif() endfunction(ginkgo_add_single_benchmark_executable) @@ -116,6 +124,9 @@ if (GINKGO_BUILD_CUDA) ginkgo_benchmark_cusparse_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION) ginkgo_benchmark_cusparse_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) + if (GINKGO_ENABLE_HALF) + ginkgo_benchmark_cusparse_linops(h GKO_BENCHMARK_USE_HALF_PRECISION) + endif() add_library(cuda_timer utils/cuda_timer.cpp) target_link_libraries(cuda_timer ginkgo CUDA::cudart) endif() diff --git a/benchmark/run_all_benchmarks.sh b/benchmark/run_all_benchmarks.sh index 2a614a87904..0efc0f0b3c2 100755 --- a/benchmark/run_all_benchmarks.sh +++ b/benchmark/run_all_benchmarks.sh @@ -110,6 +110,8 @@ elif [ "${BENCHMARK_PRECISION}" == "dcomplex" ]; then BENCH_SUFFIX="_dcomplex" elif [ "${BENCHMARK_PRECISION}" == "scomplex" ]; then BENCH_SUFFIX="_scomplex" +elif [ "${BENCHMARK_PRECISION}" == "half" ]; then + BENCH_SUFFIX="_half" else echo "BENCHMARK_PRECISION is set to the not supported \"${BENCHMARK_PRECISION}\"." 
1>&2 echo "Currently supported values: \"double\", \"single\", \"dcomplex\" and \"scomplex\"" 1>&2 @@ -216,9 +218,16 @@ keep_latest() { compute_matrix_statistics() { [ "${DRY_RUN}" == "true" ] && return cp "$1" "$1.imd" # make sure we're not loosing the original input - ./matrix_statistics/matrix_statistics${BENCH_SUFFIX} \ - --backup="$1.bkp" --double_buffer="$1.bkp2" \ - <"$1.imd" 2>&1 >"$1" + if [ "${BENCH_SUFFIX}" == "_half" ]; then + # half precision benchmark still uses single for statistics + ./matrix_statistics/matrix_statistics_single \ + --backup="$1.bkp" --double_buffer="$1.bkp2" \ + <"$1.imd" 2>&1 >"$1" + else + ./matrix_statistics/matrix_statistics${BENCH_SUFFIX} \ + --backup="$1.bkp" --double_buffer="$1.bkp2" \ + <"$1.imd" 2>&1 >"$1" + fi keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd" } diff --git a/benchmark/spmv/CMakeLists.txt b/benchmark/spmv/CMakeLists.txt index 1e3bab1c884..0165d96a264 100644 --- a/benchmark/spmv/CMakeLists.txt +++ b/benchmark/spmv/CMakeLists.txt @@ -1,4 +1,9 @@ ginkgo_add_typed_benchmark_executables(spmv "YES" spmv.cpp) +# TODO: move to all benchmark +if (GINKGO_ENABLE_HALF) + ginkgo_add_single_benchmark_executable( + "spmv_half" "YES" "GKO_BENCHMARK_USE_HALF_PRECISION" "h" spmv.cpp) +endif() if(GINKGO_BUILD_MPI) add_subdirectory(distributed) endif() diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp index c85642bb5f1..1f8bf590703 100644 --- a/benchmark/spmv/spmv_common.hpp +++ b/benchmark/spmv/spmv_common.hpp @@ -144,7 +144,9 @@ struct SpmvBenchmark : Benchmark> { exec->synchronize(); auto max_relative_norm2 = compute_max_relative_norm2(x_clone.get(), state.answer.get()); - format_case["max_relative_norm2"] = max_relative_norm2; + format_case["max_relative_norm2"] = + static_cast::type>( + max_relative_norm2); } IterationControl ic{timer}; diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp index e2221614d9c..77c8d1f2f5c 100644 --- a/benchmark/utils/cuda_linops.cpp +++ b/benchmark/utils/cuda_linops.cpp @@ -558,14 +558,19 @@ class CusparseHybrid ((CUDA_VERSION >= 10020) && !(defined(_WIN32) || defined(__CYGWIN__))) +// cuSPARSE does not support 16 bit compute for full 16 bit floating point +// input. Also, the scalar must be the compute type, i.e. float. 
template -void cusparse_generic_spmv(std::shared_ptr gpu_exec, - const cusparseSpMatDescr_t mat, - const gko::array& scalars, - const gko::LinOp* b, gko::LinOp* x, - cusparseOperation_t trans, cusparseSpMVAlg_t alg) +void cusparse_generic_spmv( + std::shared_ptr gpu_exec, + const cusparseSpMatDescr_t mat, + const gko::array::type>& scalars, + const gko::LinOp* b, gko::LinOp* x, cusparseOperation_t trans, + cusparseSpMVAlg_t alg) { cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type(); + cudaDataType_t compute_value = gko::kernels::cuda::cuda_data_type< + typename gko::detail::arth_type::type>(); using gko::kernels::cuda::as_culibs_type; auto dense_b = gko::as>(b); auto dense_x = gko::as>(x); @@ -584,13 +589,14 @@ void cusparse_generic_spmv(std::shared_ptr gpu_exec, gko::size_type buffer_size = 0; GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV_bufferSize( gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0], - mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, + mat, vecb, &scalars.get_const_data()[1], vecx, compute_value, alg, &buffer_size)); gko::array buffer_array(gpu_exec, buffer_size); auto dbuffer = buffer_array.get_data(); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV( gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0], - mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, dbuffer)); + mat, vecb, &scalars.get_const_data()[1], vecx, compute_value, alg, + dbuffer)); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecx)); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecb)); } @@ -669,8 +675,8 @@ class CusparseGenericCsr protected: void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override { - cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_, - Alg); + cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, + x, trans_, Alg); } void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, @@ -686,9 +692,11 @@ class CusparseGenericCsr {} private: + using compute_type = typename gko::detail::arth_type::type; // Contains {alpha, beta} - gko::array scalars{ - this->get_executor(), {gko::one(), gko::zero()}}; + gko::array scalars{ + this->get_executor(), + {gko::one(), gko::zero()}}; std::shared_ptr csr_; cusparseOperation_t trans_; cusparseSpMatDescr_t mat_; @@ -761,8 +769,8 @@ class CusparseGenericCoo protected: void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override { - cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_, - default_csr_alg); + cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, + x, trans_, default_csr_alg); } void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, @@ -777,9 +785,11 @@ class CusparseGenericCoo {} private: + using compute_type = typename gko::detail::arth_type::type; // Contains {alpha, beta} - gko::array scalars{ - this->get_executor(), {gko::one(), gko::zero()}}; + gko::array scalars{ + this->get_executor(), + {gko::one(), gko::zero()}}; std::shared_ptr coo_; cusparseOperation_t trans_; cusparseSpMatDescr_t mat_; diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp index 3f26ed3f2fc..ad39ac28d85 100644 --- a/benchmark/utils/generator.hpp +++ b/benchmark/utils/generator.hpp @@ -158,10 +158,7 @@ struct DefaultSystemGenerator { { auto res = Vec::create(exec); res->read(gko::matrix_data( - size, - std::uniform_real_distribution>(-1.0, - 1.0), - get_engine())); + size, std::uniform_real_distribution<>(-1.0, 1.0), get_engine())); return res; } diff --git a/benchmark/utils/types.hpp 
b/benchmark/utils/types.hpp index acd0c6cb8a2..fa79bea3801 100644 --- a/benchmark/utils/types.hpp +++ b/benchmark/utils/types.hpp @@ -46,7 +46,8 @@ using itype = gko::int32; #if defined(GKO_BENCHMARK_USE_DOUBLE_PRECISION) || \ defined(GKO_BENCHMARK_USE_SINGLE_PRECISION) || \ defined(GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) || \ - defined(GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) + defined(GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) || \ + defined(GKO_BENCHMARK_USE_HALF_PRECISION) // separate ifdefs to catch duplicate definitions #ifdef GKO_BENCHMARK_USE_DOUBLE_PRECISION using etype = double; @@ -60,6 +61,10 @@ using etype = std::complex; #ifdef GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION using etype = std::complex; #endif +#ifdef GKO_BENCHMARK_USE_HALF_PRECISION +#include +using etype = gko::half; +#endif #else // default to double precision using etype = double; #endif From 4eb3b53bcce039d79edcff934a9a123bf3df37d9 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 7 Sep 2023 10:47:35 +0200 Subject: [PATCH 37/48] add bfloat16 --- core/test/base/CMakeLists.txt | 1 + core/test/base/extended_bfloat16.cpp | 331 ++++++++++++++++++++++ include/ginkgo/core/base/half.hpp | 401 ++++++++++++++++++++++++++- include/ginkgo/core/base/types.hpp | 1 + 4 files changed, 732 insertions(+), 2 deletions(-) create mode 100644 core/test/base/extended_bfloat16.cpp diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt index 36bad656b07..200c181e513 100644 --- a/core/test/base/CMakeLists.txt +++ b/core/test/base/CMakeLists.txt @@ -10,6 +10,7 @@ ginkgo_create_test(dim) ginkgo_create_test(exception) ginkgo_create_test(exception_helpers) ginkgo_create_test(extended_float) +ginkgo_create_test(extended_bfloat16) ginkgo_create_test(executor) ginkgo_create_test(iterator_factory) ginkgo_create_test(lin_op) diff --git a/core/test/base/extended_bfloat16.cpp b/core/test/base/extended_bfloat16.cpp new file mode 100644 index 00000000000..280b7947f8a --- /dev/null +++ b/core/test/base/extended_bfloat16.cpp @@ -0,0 +1,331 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/base/extended_float.hpp" + + +#include +#include +#include + + +#include + + +#include + +namespace { + + +template +struct floating_impl; + +template <> +struct floating_impl<16> { + using type = gko::bfloat16; +}; + +template <> +struct floating_impl<32> { + using type = float; +}; + +template <> +struct floating_impl<64> { + using type = double; +}; + +template +using floating = typename floating_impl::type; + + +class ExtendedFloatTestBase : public ::testing::Test { +protected: + using bfloat16 = gko::bfloat16; + template + using truncated = gko::truncated; + + static constexpr auto byte_size = gko::byte_size; + + template + static floating create_from_bits(const char (&s)[N]) + { + auto bits = std::bitset(s).to_ullong(); + return reinterpret_cast&>(bits); + } + + template + static std::bitset get_bits(T val) + { + auto bits = + reinterpret_cast::bits_type&>( + val); + return std::bitset(bits); + } + + template + static std::bitset get_bits(const char (&s)[N]) + { + return std::bitset(s); + } +}; + + +class FloatToBFloat16 : public ExtendedFloatTestBase {}; + + +// clang-format does terrible formatting of string literal concatenation +// clang-format off + + +TEST_F(FloatToBFloat16, ConvertsOne) +{ + bfloat16 x = create_from_bits("0" "01111111" "00000000000000000000000"); + + ASSERT_EQ(get_bits(x), get_bits("0" "01111111" "0000000")); +} + + +TEST_F(FloatToBFloat16, ConvertsZero) +{ + bfloat16 x = create_from_bits("0" "00000000" "00000000000000000000000"); + + ASSERT_EQ(get_bits(x), get_bits("0" "00000000" "0000000")); +} + + +TEST_F(FloatToBFloat16, ConvertsInf) +{ + bfloat16 x = create_from_bits("0" "11111111" "00000000000000000000000"); + + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "0000000")); +} + + +TEST_F(FloatToBFloat16, ConvertsNegInf) +{ + bfloat16 x = create_from_bits("1" "11111111" "00000000000000000000000"); + + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "0000000")); +} + + +TEST_F(FloatToBFloat16, ConvertsNan) +{ + bfloat16 x = create_from_bits("0" "11111111" "00000000000000000000001"); + + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // Sycl put the 1000000000, but ours put mask + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "1000000")); + #else + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "1111111")); + #endif +} + + +TEST_F(FloatToBFloat16, ConvertsNegNan) +{ + bfloat16 x = create_from_bits("1" "11111111" "00010000000000000000000"); + + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // Sycl put the 1000000000, but ours put mask + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "1000000")); + #else + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "1111111")); + #endif +} + + +TEST_F(FloatToBFloat16, FlushesToZero) +{ + bfloat16 x = create_from_bits("0" "00000000" 
"00000000000100000001000"); + + ASSERT_EQ(get_bits(x), get_bits("0" "00000000" "0000000")); +} + + +TEST_F(FloatToBFloat16, FlushesToNegZero) +{ + bfloat16 x = create_from_bits("1" "00000000" "00000000000100000001000"); + + ASSERT_EQ(get_bits(x), get_bits("1" "00000000" "0000000")); +} + + +TEST_F(FloatToBFloat16, FlushesToInf) +{ + bfloat16 x = create_from_bits("0" "11111110" "11111111111111111111111"); + + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "0000000")); +} + + +TEST_F(FloatToBFloat16, FlushesToNegInf) +{ + bfloat16 x = create_from_bits("1" "11111110" "11111111111111111111111"); + + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "0000000")); +} + + +TEST_F(FloatToBFloat16, TruncatesSmallNumber) +{ + bfloat16 x = create_from_bits("0" "01110001" "10010000000000010000100"); + + ASSERT_EQ(get_bits(x), get_bits("0" "01110001" "1001000")); +} + + +TEST_F(FloatToBFloat16, TruncatesLargeNumberRoundToEven) +{ + bfloat16 neg_x = create_from_bits("1" "10001110" "10010111111000010000100"); + bfloat16 neg_x2 = create_from_bits("1" "10001110" "10010101111000010000100"); + bfloat16 x = create_from_bits("0" "10001110" "10010111111000010000100"); + bfloat16 x2 = create_from_bits("0" "10001110" "10010101111000010000100"); + bfloat16 x3 = create_from_bits("0" "10001110" "10010101000000000000000"); + bfloat16 x4 = create_from_bits("0" "10001110" "10010111000000000000000"); + + EXPECT_EQ(get_bits(x), get_bits("0" "10001110" "1001100")); + EXPECT_EQ(get_bits(x2), get_bits("0" "10001110" "1001011")); + EXPECT_EQ(get_bits(x3), get_bits("0" "10001110" "1001010")); + EXPECT_EQ(get_bits(x4), get_bits("0" "10001110" "1001100")); + EXPECT_EQ(get_bits(neg_x), get_bits("1" "10001110" "1001100")); + EXPECT_EQ(get_bits(neg_x2), get_bits("1" "10001110" "1001011")); +} + + +TEST_F(FloatToBFloat16, Convert) +{ + float rho = 86.25; + float beta = 1110; + auto float_res = rho/beta; + gko::bfloat16 rho_h = rho; + gko::bfloat16 beta_h = beta; + auto bfloat16_res = rho_h/beta_h; + std::cout << float_res << std::endl; + std::cout << float(bfloat16_res) << std::endl; + + std::complex cpx{100.0, 0.0}; + std::cout << float(gko::squared_norm(cpx)) << std::endl; +} + +// clang-format on + + +class bfloat16ToFloat : public ExtendedFloatTestBase {}; + + +// clang-format off + + +TEST_F(bfloat16ToFloat, ConvertsOne) +{ + float x = create_from_bits("0" "01111111" "0000000"); + + ASSERT_EQ(get_bits(x), get_bits("0" "01111111" "00000000000000000000000")); +} + + +TEST_F(bfloat16ToFloat, ConvertsZero) +{ + float x = create_from_bits("0" "00000000" "0000000"); + + ASSERT_EQ(get_bits(x), get_bits("0" "00000000" "00000000000000000000000")); +} + + +TEST_F(bfloat16ToFloat, ConvertsInf) +{ + float x = create_from_bits("0" "11111111" "0000000"); + + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "00000000000000000000000")); +} + + +TEST_F(bfloat16ToFloat, ConvertsNegInf) +{ + float x = create_from_bits("1" "11111111" "0000000"); + + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "00000000000000000000000")); +} + + +TEST_F(bfloat16ToFloat, ConvertsNan) +{ + float x = create_from_bits("0" "11111111" "0001001"); + + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // sycl keeps significand + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "00010010000000000000000")); + #else + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "11111111111111111111111")); + #endif +} + + +TEST_F(bfloat16ToFloat, ConvertsNegNan) +{ + float x = create_from_bits("1" 
"11111111" "0000001"); + + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // sycl keeps significand + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "00000010000000000000000")); + #else + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "11111111111111111111111")); + #endif +} + + +TEST_F(bfloat16ToFloat, ExtendsSmallNumber) +{ + float x = create_from_bits("0" "01110001" "1000010"); + + ASSERT_EQ(get_bits(x), get_bits("0" "01110001" "10000100000000000000000")); +} + + +TEST_F(bfloat16ToFloat, ExtendsLargeNumber) +{ + float x = create_from_bits("1" "10001110" "1001001"); + + ASSERT_EQ(get_bits(x), get_bits("1" "10001110" "10010010000000000000000")); +} + + +// clang-format on + + +} // namespace diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 8df7b14fec9..c75cfd89dcf 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -45,19 +45,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef __CUDA_ARCH__ +#include #include +class hip_bfloat16; #elif defined(__HIP_DEVICE_COMPILE__) +#include #include - +class __nv_bfloat16; #else class __half; +class __nv_bfloat16; +class hip_bfloat16; #endif // __CUDA_ARCH__ @@ -107,6 +112,15 @@ struct basic_float_traits { static constexpr bool rounds_to_nearest = true; }; +template <> +struct basic_float_traits { + using type = bfloat16; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 7; + static constexpr int exponent_bits = 8; + static constexpr bool rounds_to_nearest = true; +}; + // #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) template <> struct basic_float_traits<__half> { @@ -116,6 +130,24 @@ struct basic_float_traits<__half> { static constexpr int exponent_bits = 5; static constexpr bool rounds_to_nearest = true; }; + +template <> +struct basic_float_traits<__nv_bfloat16> { + using type = __nv_bfloat16; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 7; + static constexpr int exponent_bits = 8; + static constexpr bool rounds_to_nearest = true; +}; + +template <> +struct basic_float_traits { + using type = hip_bfloat16; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 7; + static constexpr int exponent_bits = 8; + static constexpr bool rounds_to_nearest = true; +}; // #endif template <> @@ -365,7 +397,7 @@ class half { return static_cast(static_cast(lhf) \ _op static_cast(rhf)); \ } \ - GKO_ATTRIBUTES half& operator _opeq(const half& hf) \ + GKO_ATTRIBUTES half& operator _opeq(const half & hf) \ { \ auto result = *this _op hf; \ this->float2half(result); \ @@ -493,6 +525,161 @@ class half { }; +class bfloat16 { +public: + GKO_ATTRIBUTES bfloat16() noexcept = default; + + template ::value>> + GKO_ATTRIBUTES bfloat16(const T val) + { + this->float2bfloat16(static_cast(val)); + } + + GKO_ATTRIBUTES bfloat16(const bfloat16& val) = default; + + template + GKO_ATTRIBUTES bfloat16& operator=(const V val) + { + this->float2bfloat16(static_cast(val)); + return *this; + } + + GKO_ATTRIBUTES operator float() const noexcept + { + const auto bits = bfloat162float(data_); + return reinterpret_cast(bits); + } + + // can not use bfloat16 operator _op(const bfloat16) for bfloat16 + bfloat16 + // operation will cast it to float and then do float operation such that it + // becomes float in the end. 
+#define bfloat16_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES friend bfloat16 operator _op(const bfloat16 lhf, \ + const bfloat16 rhf) \ + { \ + return static_cast(static_cast(lhf) \ + _op static_cast(rhf)); \ + } \ + GKO_ATTRIBUTES bfloat16& operator _opeq(const bfloat16 & hf) \ + { \ + auto result = *this _op hf; \ + this->float2bfloat16(result); \ + return *this; \ + } + bfloat16_OPERATOR(+, +=) bfloat16_OPERATOR(-, -=) bfloat16_OPERATOR(*, *=) + bfloat16_OPERATOR(/, /=) + + // Do operation with different type + // If it is floating point, using floating point as type. + // If it is integer, using bfloat16 as type +#define bfloat16_FRIEND_OPERATOR(_op, _opeq) \ + template \ + GKO_ATTRIBUTES friend std::enable_if_t< \ + !std::is_same::value && std::is_scalar::value, \ + typename std::conditional::value, T, \ + bfloat16>::type> \ + operator _op(const bfloat16 hf, const T val) \ + { \ + using type = \ + typename std::conditional::value, T, \ + bfloat16>::type; \ + auto result = static_cast(hf); \ + result _opeq static_cast(val); \ + return result; \ + } \ + template \ + GKO_ATTRIBUTES friend std::enable_if_t< \ + !std::is_same::value && std::is_scalar::value, \ + typename std::conditional::value, T, \ + bfloat16>::type> \ + operator _op(const T val, const bfloat16 hf) \ + { \ + using type = \ + typename std::conditional::value, T, \ + bfloat16>::type; \ + auto result = static_cast(val); \ + result _opeq static_cast(hf); \ + return result; \ + } + + bfloat16_FRIEND_OPERATOR(+, +=) bfloat16_FRIEND_OPERATOR(-, -=) + bfloat16_FRIEND_OPERATOR(*, *=) bfloat16_FRIEND_OPERATOR(/, /=) + + // the negative + GKO_ATTRIBUTES bfloat16 + operator-() const + { + auto val = 0.0f - *this; + return bfloat16(val); + } + +private: + using f16_traits = detail::float_traits; + using f32_traits = detail::float_traits; + + // TODO: do we really need this one? + // Without it, everything can be constexpr, which might make stuff easier. 
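+    // Reinterprets the bit pattern of `val` as a uint32 and forwards it to
+    // the static bit-level converter below, which handles infinities, NaNs
+    // and exponent overflow separately (results too small for a normal
+    // bfloat16 are flushed to signed zero) and otherwise rounds the
+    // significand to nearest, ties to even.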
+ GKO_ATTRIBUTES void float2bfloat16(float val) noexcept + { + data_ = float2bfloat16(reinterpret_cast(val)); + } + + static GKO_ATTRIBUTES uint16 float2bfloat16(uint32 data_) noexcept + { + using conv = detail::precision_converter; + if (f32_traits::is_inf(data_)) { + return conv::shift_sign(data_) | f16_traits::exponent_mask; + } else if (f32_traits::is_nan(data_)) { + return conv::shift_sign(data_) | f16_traits::exponent_mask | + f16_traits::significand_mask; + } else { + const auto exp = conv::shift_exponent(data_); + if (f16_traits::is_inf(exp)) { + return conv::shift_sign(data_) | exp; + } else if (f16_traits::is_denom(exp)) { + // TODO: handle denormals + return conv::shift_sign(data_); + } else { + // Rounding to even + const auto result = conv::shift_sign(data_) | exp | + conv::shift_significand(data_); + // return result + ((result & 1) && + // ((data_ >> (f32_traits::significand_bits - + // f16_traits::significand_bits - 1)) & + // 1)); + const auto tail = + data_ & static_cast( + (1 << conv::significand_offset) - 1); + + constexpr auto bfloat16 = static_cast( + 1 << (conv::significand_offset - 1)); + return result + (tail > bfloat16 || + ((tail == bfloat16) && (result & 1))); + } + } + } + + static GKO_ATTRIBUTES uint32 bfloat162float(uint16 data_) noexcept + { + using conv = detail::precision_converter; + if (f16_traits::is_inf(data_)) { + return conv::shift_sign(data_) | f32_traits::exponent_mask; + } else if (f16_traits::is_nan(data_)) { + return conv::shift_sign(data_) | f32_traits::exponent_mask | + f32_traits::significand_mask; + } else if (f16_traits::is_denom(data_)) { + // TODO: handle denormals + return conv::shift_sign(data_); + } else { + return conv::shift_sign(data_) | conv::shift_exponent(data_) | + conv::shift_significand(data_); + } + } + + uint16 data_; +}; + + } // namespace gko @@ -652,6 +839,161 @@ class complex { value_type imag_; }; + +template <> +class complex { +public: + using value_type = gko::bfloat16; + + complex(const value_type& real = value_type(0.f), + const value_type& imag = value_type(0.f)) + : real_(real), imag_(imag) + {} + template ::value && + std::is_scalar::value>> + explicit complex(const T& real, const U& imag) + : real_(static_cast(real)), + imag_(static_cast(imag)) + {} + + template ::value>> + complex(const T& real) + : real_(static_cast(real)), + imag_(static_cast(0.f)) + {} + + // When using complex(real, imag), MSVC with CUDA try to recognize the + // complex is a member not constructor. 
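+    // The converting constructor below is explicit, so a complex value with a
+    // different scalar type (e.g. complex<float> or complex<double>) has to
+    // be narrowed to complex<gko::bfloat16> through an explicit cast rather
+    // than implicitly.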
+ template ::value>> + explicit complex(const complex& other) + : real_(static_cast(other.real())), + imag_(static_cast(other.imag())) + {} + + // explicit complex(const complex& other) = default; + + value_type real() const noexcept { return real_; } + + value_type imag() const noexcept { return imag_; } + + + operator std::complex() const noexcept + { + return std::complex(static_cast(real_), + static_cast(imag_)); + } + + // operator std::complex() const noexcept + // { + // return std::complex(static_cast(real_), + // static_cast(imag_)); + // } + + template + complex& operator=(const V& val) + { + real_ = val; + imag_ = value_type(); + return *this; + } + + template + complex& operator=(const std::complex& val) + { + real_ = val.real(); + imag_ = val.imag(); + return *this; + } + + complex& operator+=(const value_type& real) + { + real_ += real; + return *this; + } + complex& operator-=(const value_type& real) + { + real_ -= real; + return *this; + } + complex& operator*=(const value_type& real) + { + real_ *= real; + imag_ *= real; + return *this; + } + complex& operator/=(const value_type& real) + { + real_ /= real; + imag_ /= real; + return *this; + } + + template + complex& operator+=(const complex& val) + { + real_ += val.real(); + imag_ += val.imag(); + return *this; + } + template + complex& operator-=(const complex& val) + { + real_ -= val.real(); + imag_ -= val.imag(); + return *this; + } + template + complex& operator*=(const complex& val) + { + auto val_f = static_cast>(val); + auto result_f = static_cast>(*this); + result_f *= val_f; + real_ = result_f.real(); + imag_ = result_f.imag(); + // auto tmp = real_; + // real_ = real_ * val.real() - imag_ * val.imag(); + // imag_ = tmp * val.imag() + imag_ * val.real(); + return *this; + } + template + complex& operator/=(const complex& val) + { + // auto real = val.real(); + // auto imag = val.imag(); + // (*this) *= complex{val.real(), -val.imag()}; + // (*this) /= (real * real + imag * imag); + auto val_f = static_cast>(val); + auto result_f = static_cast>(*this); + result_f /= val_f; + real_ = result_f.real(); + imag_ = result_f.imag(); + return *this; + } + +// It's for MacOS. +// TODO: check whether mac compiler always use complex version even when real +// bfloat16 +#define COMPLEX_BFLOAT16_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES friend complex operator _op( \ + const complex lhf, const complex rhf) \ + { \ + auto a = lhf; \ + a _opeq rhf; \ + return a; \ + } + + COMPLEX_BFLOAT16_OPERATOR(+, +=) + COMPLEX_BFLOAT16_OPERATOR(-, -=) + COMPLEX_BFLOAT16_OPERATOR(*, *=) + COMPLEX_BFLOAT16_OPERATOR(/, /=) + +private: + value_type real_; + value_type imag_; +}; + + template <> struct numeric_limits { static constexpr bool is_specialized{true}; @@ -688,6 +1030,43 @@ struct numeric_limits { }; +template <> +struct numeric_limits { + static constexpr bool is_specialized{true}; + static constexpr bool is_signed{true}; + static constexpr bool is_integer{false}; + static constexpr bool is_exact{false}; + static constexpr bool is_bounded{true}; + static constexpr bool is_modulo{false}; + static constexpr int digits{ + gko::detail::float_traits::significand_bits + 1}; + // 3/10 is approx. log_10(2) + static constexpr int digits10{digits * 3 / 10}; + + // Note: gko::bfloat16 can't return gko::bfloat16 here because it does not + // have + // a constexpr constructor. 
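+    // For bfloat16 (8 exponent bits, 7 stored significand bits) this yields
+    // digits = 8 and digits10 = 2; epsilon() below is the bfloat16 machine
+    // epsilon, 2^-7 = 0.0078125, returned as a float for the reason noted
+    // above.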
+ static constexpr float epsilon() + { + return gko::detail::float_traits::eps; + } + + static constexpr float infinity() + { + return numeric_limits::infinity(); + } + + static constexpr float min() { return numeric_limits::min(); } + + static constexpr float max() { return numeric_limits::max(); } + + static constexpr float quiet_NaN() + { + return numeric_limits::quiet_NaN(); + } +}; + + // complex using a template on operator= for any kind of complex, so we can // do full specialization for half template <> @@ -699,6 +1078,15 @@ inline complex& complex::operator=( return *this; } +template <> +inline complex& complex::operator=( + const std::complex& a) +{ + complex t(a.real(), a.imag()); + operator=(t); + return *this; +} + // For MSVC template <> @@ -710,6 +1098,15 @@ inline complex& complex::operator=( return *this; } +template <> +inline complex& complex::operator=( + const std::complex& a) +{ + complex t(a.real(), a.imag()); + operator=(t); + return *this; +} + } // namespace std diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index bb345207199..c51e52846aa 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -165,6 +165,7 @@ using uintptr = std::uintptr_t; // using half = sycl::half; // #else class half; +class bfloat16; // #endif From f3913a24dcd467ac4473b7aba641545d34ad96f5 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 11 Sep 2023 16:16:20 +0200 Subject: [PATCH 38/48] this can be compiled after cuda arch 80 --- accessor/cuda_helper.hpp | 5 + accessor/hip_helper.hpp | 5 + common/cuda_hip/base/math.hpp.inc | 15 ++ common/cuda_hip/components/volatile.hpp.inc | 4 +- .../precision_conversion_kernels.cpp | 12 +- .../unified/matrix/dense_kernels.template.cpp | 4 +- core/base/batch_multi_vector.cpp | 17 ++ core/base/mixed_precision_types.hpp | 150 +++++++++++- core/base/mtx_io.cpp | 11 +- core/distributed/matrix.cpp | 39 ++++ core/distributed/vector.cpp | 18 ++ core/matrix/coo.cpp | 19 ++ core/matrix/csr.cpp | 20 ++ core/matrix/dense.cpp | 23 ++ core/matrix/diagonal.cpp | 16 ++ core/matrix/ell.cpp | 20 ++ core/matrix/fbcsr.cpp | 21 ++ core/matrix/hybrid.cpp | 21 ++ core/matrix/row_gatherer.cpp | 6 +- core/matrix/sellp.cpp | 22 ++ core/solver/multigrid.cpp | 28 +-- core/test/base/extended_bfloat16.cpp | 6 +- core/test/utils.hpp | 5 + cuda/base/batch_multi_vector_kernels.cu | 3 + cuda/base/types.cpp | 33 +++ cuda/base/types.hpp | 129 ++++++++++ cuda/factorization/cholesky_kernels.cu | 1 + .../ginkgo/core/base/batch_multi_vector.hpp | 10 + include/ginkgo/core/base/half.hpp | 38 ++- include/ginkgo/core/base/math.hpp | 157 ++++++++++++- include/ginkgo/core/base/mpi.hpp | 2 + include/ginkgo/core/base/types.hpp | 221 +++++++++++------- include/ginkgo/core/distributed/matrix.hpp | 14 ++ include/ginkgo/core/distributed/vector.hpp | 20 ++ include/ginkgo/core/matrix/coo.hpp | 12 + include/ginkgo/core/matrix/csr.hpp | 13 ++ include/ginkgo/core/matrix/dense.hpp | 10 + include/ginkgo/core/matrix/diagonal.hpp | 10 + include/ginkgo/core/matrix/ell.hpp | 12 + include/ginkgo/core/matrix/fbcsr.hpp | 42 ++-- include/ginkgo/core/matrix/hybrid.hpp | 13 ++ include/ginkgo/core/matrix/sellp.hpp | 38 ++- omp/components/atomic.hpp | 20 ++ 43 files changed, 1154 insertions(+), 131 deletions(-) create mode 100644 cuda/base/types.cpp diff --git a/accessor/cuda_helper.hpp b/accessor/cuda_helper.hpp index d11b934e90e..0167ee0f9c4 100644 --- a/accessor/cuda_helper.hpp +++ b/accessor/cuda_helper.hpp @@ -65,6 +65,11 @@ struct 
cuda_type { using type = __half; }; +template <> +struct cuda_type { + using type = __nv_bfloat16; +}; + // Unpack cv and reference / pointer qualifiers template struct cuda_type { diff --git a/accessor/hip_helper.hpp b/accessor/hip_helper.hpp index 5feaa45400b..4b1054907ba 100644 --- a/accessor/hip_helper.hpp +++ b/accessor/hip_helper.hpp @@ -91,6 +91,11 @@ struct hip_type { using type = __half; }; +template <> +struct hip_type { + using type = hip_bfloat16; +}; + // Transform std::complex to thrust::complex template struct hip_type> { diff --git a/common/cuda_hip/base/math.hpp.inc b/common/cuda_hip/base/math.hpp.inc index 39ab5e8baf5..bea153dbeda 100644 --- a/common/cuda_hip/base/math.hpp.inc +++ b/common/cuda_hip/base/math.hpp.inc @@ -47,6 +47,21 @@ struct device_numeric_limits<__half> { }; +template <> +struct device_numeric_limits<__nv_bfloat16> { + static constexpr auto inf = std::numeric_limits::infinity(); + static constexpr auto max = std::numeric_limits::max(); + static constexpr auto min = std::numeric_limits::min(); +}; + + +template <> +struct device_numeric_limits { + static constexpr auto inf = std::numeric_limits::infinity(); + static constexpr auto max = std::numeric_limits::max(); + static constexpr auto min = std::numeric_limits::min(); +}; + namespace detail { diff --git a/common/cuda_hip/components/volatile.hpp.inc b/common/cuda_hip/components/volatile.hpp.inc index d9c56c71238..75b586a8508 100644 --- a/common/cuda_hip/components/volatile.hpp.inc +++ b/common/cuda_hip/components/volatile.hpp.inc @@ -42,7 +42,9 @@ __device__ __forceinline__ template __device__ __forceinline__ std::enable_if_t::value || - std::is_same::value, + std::is_same::value || + std::is_same::value || + std::is_same::value, thrust::complex> load(const thrust::complex* values, IndexType index) { diff --git a/common/unified/components/precision_conversion_kernels.cpp b/common/unified/components/precision_conversion_kernels.cpp index 47ddf7bfc92..222e9471f96 100644 --- a/common/unified/components/precision_conversion_kernels.cpp +++ b/common/unified/components/precision_conversion_kernels.cpp @@ -36,6 +36,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common/unified/base/kernel_launch.hpp" +#include +#include + namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { @@ -48,7 +51,14 @@ void convert_precision(std::shared_ptr exec, { run_kernel( exec, - [] GKO_KERNEL(auto idx, auto in, auto out) { out[idx] = in[idx]; }, + [] GKO_KERNEL(auto idx, auto in, auto out) { + using in_type = typename std::remove_cv< + typename std::remove_reference::type>::type; + using out_type = typename std::remove_cv< + typename std::remove_reference::type>::type; + out[idx] = + static_cast>(in[idx]); + }, size, in, out); } diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp index b6ed5fb37e0..3a4ab98e5f6 100644 --- a/common/unified/matrix/dense_kernels.template.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -62,7 +62,9 @@ void copy(std::shared_ptr exec, run_kernel( exec, [] GKO_KERNEL(auto row, auto col, auto input, auto output) { - output(row, col) = input(row, col); + using type = + device_type>; + output(row, col) = static_cast(input(row, col)); }, input->get_size(), input, output); } diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 3774b6aad58..5dc119c71e5 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -309,6 +309,23 @@ void MultiVector::move_to( #endif +template +void MultiVector::convert_to( + MultiVector>* result) const +{ + result->values_ = this->values_; + result->set_size(this->get_size()); +} + + +template +void MultiVector::move_to( + MultiVector>* result) +{ + this->convert_to(result); +} + + #define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class MultiVector<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR); diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index 91aa9e4eefa..f23a3352ed0 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -44,41 +44,69 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \ GKO_ADAPT_HF(_macro(float, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, half, bfloat16, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(float, half, float, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(float, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, bfloat16, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, bfloat16, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, bfloat16, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, bfloat16, double, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(float, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, float, bfloat16, __VA_ARGS__)); \ template _macro(float, float, float, __VA_ARGS__); \ template _macro(float, float, double, __VA_ARGS__); \ GKO_ADAPT_HF(_macro(float, double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, double, bfloat16, __VA_ARGS__)); \ template _macro(float, double, float, __VA_ARGS__); \ template _macro(float, double, double, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) 
\ GKO_ADAPT_HF(_macro(double, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, half, bfloat16, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(double, half, float, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(double, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, bfloat16, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, bfloat16, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, bfloat16, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, bfloat16, double, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(double, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, float, bfloat16, __VA_ARGS__)); \ template _macro(double, float, float, __VA_ARGS__); \ template _macro(double, float, double, __VA_ARGS__); \ GKO_ADAPT_HF(_macro(double, double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, double, bfloat16, __VA_ARGS__)); \ template _macro(double, double, float, __VA_ARGS__); \ template _macro(double, double, double, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ @@ -87,18 +115,32 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) 
\ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ @@ -106,35 +148,108 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) \ GKO_ADAPT_HF(_macro(half, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, half, bfloat16, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(half, half, float, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(half, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, bfloat16, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, bfloat16, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, bfloat16, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, bfloat16, double, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(half, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, float, bfloat16, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(half, float, float, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(half, float, double, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(half, double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, double, bfloat16, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(half, double, float, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(half, double, double, __VA_ARGS__)) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) 
\ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT7(_macro, ...) \ + GKO_ADAPT_HF(_macro(bfloat16, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, half, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, float, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, float, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, float, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, double, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, double, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, double, double, __VA_ARGS__)) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT8(_macro, ...) 
\ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) + #else @@ -159,6 +274,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GKO_ADAPT_HF(_macro(std::complex, std::complex, \ std::complex, __VA_ARGS__)) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT7(_macro, ...) \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16, bfloat16, __VA_ARGS__)) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT8(_macro, ...) \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) + #endif @@ -169,7 +291,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, __VA_ARGS__); \ - GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, __VA_ARGS__) + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT7(_macro, __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT8(_macro, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ @@ -180,33 +304,57 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef GINKGO_MIXED_PRECISION #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) 
\ GKO_ADAPT_HF(_macro(half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, bfloat16, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(half, float, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, double, __VA_ARGS__)); \ GKO_ADAPT_HF(_macro(float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, bfloat16, __VA_ARGS__)); \ template _macro(float, float, __VA_ARGS__); \ template _macro(float, double, __VA_ARGS__); \ GKO_ADAPT_HF(_macro(double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, bfloat16, __VA_ARGS__)); \ template _macro(double, float, __VA_ARGS__); \ template _macro(double, double, __VA_ARGS__); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF( \ _macro(std::complex, std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF( \ _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF( \ _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ GKO_ADAPT_HF( \ _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) #else #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ GKO_ADAPT_HF(_macro(half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16, __VA_ARGS__)); \ template _macro(float, float, __VA_ARGS__); \ template _macro(double, double, __VA_ARGS__); \ GKO_ADAPT_HF(_macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) #endif diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index 2616feb0530..a0f612ca2fd 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -780,6 +780,7 @@ static constexpr uint64 binary_format_magic() constexpr auto is_int = std::is_same::value; constexpr auto is_long = std::is_same::value; constexpr auto is_half = std::is_same::value; + constexpr auto is_bfloat16 = std::is_same::value; constexpr auto is_double = std::is_same::value; constexpr auto is_float = std::is_same::value; constexpr auto is_complex_double = @@ -788,8 +789,11 @@ static constexpr uint64 binary_format_magic() std::is_same>::value; constexpr auto is_complex_half = std::is_same>::value; + constexpr auto is_complex_bfloat16 = + std::is_same>::value; static_assert(is_int || is_long, "invalid storage index type"); - static_assert(is_half || is_complex_half || is_double || is_float || + static_assert(is_bfloat16 || is_complex_bfloat16 || is_half || + is_complex_half || is_double || is_float || is_complex_double || is_complex_float, "invalid storage value type"); constexpr auto index_bit = is_int ? 
'I' : 'L'; @@ -800,7 +804,10 @@ static constexpr uint64 binary_format_magic() ? 'S' : (is_complex_double ? 'Z' - : (is_complex_float ? 'C' : (is_half ? 'H' : 'X')))); + : (is_complex_float + ? 'C' + : (is_half ? 'H' + : (is_bfloat16 ? 'B' : 'X'))))); constexpr uint64 shift = 256; constexpr uint64 type_bits = index_bit * shift + value_bit; return 'G' + diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 2325047cc78..d0d49583533 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -177,6 +177,45 @@ void Matrix::move_to( result->set_size(this->get_size()); this->set_size({}); } + + +template +void Matrix::convert_to( + Matrix, local_index_type, global_index_type>* + result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->copy_from(this->local_mtx_.get()); + result->non_local_mtx_->copy_from(this->non_local_mtx_.get()); + result->gather_idxs_ = this->gather_idxs_; + result->send_offsets_ = this->send_offsets_; + result->recv_offsets_ = this->recv_offsets_; + result->recv_sizes_ = this->recv_sizes_; + result->send_sizes_ = this->send_sizes_; + result->non_local_to_global_ = this->non_local_to_global_; + result->set_size(this->get_size()); +} + + +template +void Matrix::move_to( + Matrix, local_index_type, global_index_type>* + result) +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->move_from(this->local_mtx_.get()); + result->non_local_mtx_->move_from(this->non_local_mtx_.get()); + result->gather_idxs_ = std::move(this->gather_idxs_); + result->send_offsets_ = std::move(this->send_offsets_); + result->recv_offsets_ = std::move(this->recv_offsets_); + result->recv_sizes_ = std::move(this->recv_sizes_); + result->send_sizes_ = std::move(this->send_sizes_); + result->non_local_to_global_ = std::move(this->non_local_to_global_); + result->set_size(this->get_size()); + this->set_size({}); +} #endif template diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 4251ad43f06..5b1549b393a 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -308,6 +308,24 @@ void Vector::move_to( { this->convert_to(result); } + + +template +void Vector::convert_to( + Vector>* result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->set_size(this->get_size()); + this->get_local_vector()->convert_to(&result->local_); +} + + +template +void Vector::move_to(Vector>* result) +{ + this->convert_to(result); +} #endif template diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 104802775ec..723e827aee5 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -162,6 +162,25 @@ void Coo::move_to( { this->convert_to(result); } + + +template +void Coo::convert_to( + Coo, IndexType>* result) const +{ + result->values_ = this->values_; + result->row_idxs_ = this->row_idxs_; + result->col_idxs_ = this->col_idxs_; + result->set_size(this->get_size()); +} + + +template +void Coo::move_to( + Coo, IndexType>* result) +{ + this->convert_to(result); +} #endif diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 24ba1c2aebf..f8466fa752d 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -276,6 +276,26 @@ void Csr::move_to( { this->convert_to(result); } + + +template +void Csr::convert_to( + Csr, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = 
this->row_ptrs_; + result->set_size(this->get_size()); + convert_strategy_helper(result); +} + + +template +void Csr::move_to( + Csr, IndexType>* result) +{ + this->convert_to(result); +} #endif diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 5ea55ced906..24f1d200375 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -608,6 +608,29 @@ void Dense::move_to( { this->convert_to(result); } + + +template +void Dense::convert_to( + Dense>* result) const +{ + if (result->get_size() != this->get_size()) { + result->set_size(this->get_size()); + result->stride_ = stride_; + result->values_.resize_and_reset(result->get_size()[0] * + result->stride_); + } + auto exec = this->get_executor(); + exec->run(dense::make_copy( + this, make_temporary_output_clone(exec, result).get())); +} + + +template +void Dense::move_to(Dense>* result) +{ + this->convert_to(result); +} #endif diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index 0d4540a615b..f4d85469780 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -209,6 +209,22 @@ void Diagonal::move_to( { this->convert_to(result); } + +template +void Diagonal::convert_to( + Diagonal>* result) const +{ + result->values_ = this->values_; + result->set_size(this->get_size()); +} + + +template +void Diagonal::move_to( + Diagonal>* result) +{ + this->convert_to(result); +} #endif diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index 9f37d0a85f7..8bdbeed628f 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -221,6 +221,26 @@ void Ell::move_to( { this->convert_to(result); } + + +template +void Ell::convert_to( + Ell, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->num_stored_elements_per_row_ = this->num_stored_elements_per_row_; + result->stride_ = this->stride_; + result->set_size(this->get_size()); +} + + +template +void Ell::move_to( + Ell, IndexType>* result) +{ + this->convert_to(result); +} #endif diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index 14b9c226bc8..8842a4b0c3a 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -219,6 +219,27 @@ void Fbcsr::move_to( { this->convert_to(result); } + + +template +void Fbcsr::convert_to( + Fbcsr, IndexType>* const result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->set_size(this->get_size()); + // block sizes are immutable except for assignment/conversion + result->bs_ = this->bs_; +} + + +template +void Fbcsr::move_to( + Fbcsr, IndexType>* const result) +{ + this->convert_to(result); +} #endif diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index d6c802cfa01..fd71683404d 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -201,6 +201,27 @@ void Hybrid::move_to( { this->convert_to(result); } + + +template +void Hybrid::convert_to( + Hybrid, IndexType>* result) const +{ + this->ell_->convert_to(result->ell_.get()); + this->coo_->convert_to(result->coo_.get()); + // TODO set strategy correctly + // There is no way to correctly clone the strategy like in + // Csr::convert_to + result->set_size(this->get_size()); +} + + +template +void Hybrid::move_to( + Hybrid, IndexType>* result) +{ + this->convert_to(result); +} #endif diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp index b3807f2514e..2c084a253a6 100644 --- a/core/matrix/row_gatherer.cpp +++ b/core/matrix/row_gatherer.cpp @@ -49,11 +49,12 @@ void 
RowGatherer::apply_impl(const LinOp* in, LinOp* out) const { run< #if GINKGO_ENABLE_HALF - const Dense*, + const Dense*, const Dense*, #endif const Dense*, const Dense*, #if GINKGO_ENABLE_HALF const Dense>*, + const Dense>*, #endif const Dense>*, const Dense>*>( in, [&](auto gather) { gather->row_gather(&row_idxs_, out); }); @@ -65,11 +66,12 @@ void RowGatherer::apply_impl(const LinOp* alpha, const LinOp* in, { run< #if GINKGO_ENABLE_HALF - const Dense*, + const Dense*, const Dense*, #endif const Dense*, const Dense*, #if GINKGO_ENABLE_HALF const Dense>*, + const Dense>*, #endif const Dense>*, const Dense>*>( in, diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index 4f36fb1b6c1..880fc36d827 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -199,6 +199,28 @@ void Sellp::move_to( { this->convert_to(result); } + + +template +void Sellp::convert_to( + Sellp, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->slice_lengths_ = this->slice_lengths_; + result->slice_sets_ = this->slice_sets_; + result->slice_size_ = this->slice_size_; + result->stride_factor_ = this->stride_factor_; + result->set_size(this->get_size()); +} + + +template +void Sellp::move_to( + Sellp, IndexType>* result) +{ + this->convert_to(result); +} #endif diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 7e41c02780e..de4f711d423 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -317,11 +317,11 @@ void MultigridState::generate(const LinOp* system_matrix_in, run, + std::complex, std::complex, #endif std::complex, std::complex>( mg_level, @@ -381,11 +381,11 @@ void MultigridState::run_mg_cycle(multigrid::cycle cycle, size_type level, auto mg_level = multigrid->get_mg_level_list().at(level); run, + std::complex, std::complex, #endif std::complex, std::complex>( mg_level, [&, this](auto mg_level) { @@ -533,11 +533,11 @@ void Multigrid::generate() run, + std::complex, std::complex, #endif std::complex, std::complex>( mg_level, @@ -578,11 +578,11 @@ void Multigrid::generate() // generate coarsest solver run, + std::complex, std::complex, #endif std::complex, std::complex>( last_mg_level, @@ -671,11 +671,11 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* b, LinOp* x, auto first_mg_level = this->get_mg_level_list().front(); run, + std::complex, std::complex, #endif std::complex, std::complex>(first_mg_level, lambda, b, x); @@ -717,11 +717,11 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* alpha, auto first_mg_level = this->get_mg_level_list().front(); run, + std::complex, std::complex, #endif std::complex, std::complex>(first_mg_level, lambda, alpha, b, beta, x); @@ -789,11 +789,11 @@ void Multigrid::apply_dense_impl(const VectorType* b, VectorType* x, run, + std::complex, std::complex, #endif std::complex, std::complex>(first_mg_level, lambda, b, x); diff --git a/core/test/base/extended_bfloat16.cpp b/core/test/base/extended_bfloat16.cpp index 280b7947f8a..4681f292325 100644 --- a/core/test/base/extended_bfloat16.cpp +++ b/core/test/base/extended_bfloat16.cpp @@ -30,9 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/base/extended_float.hpp" - - #include #include #include @@ -43,6 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include + +#include "core/base/extended_float.hpp" + namespace { diff --git a/core/test/utils.hpp b/core/test/utils.hpp index 098b6355cee..554568ed81a 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -360,6 +360,11 @@ struct next_precision_impl { using type = float; }; +template <> +struct next_precision_impl { + using type = float; +}; + template <> struct next_precision_impl { using type = double; diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index 7729d006b75..23fe80c2a1b 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -41,6 +41,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "cuda/base/types.hpp" + + #include "core/base/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" #include "cuda/base/config.hpp" diff --git a/cuda/base/types.cpp b/cuda/base/types.cpp new file mode 100644 index 00000000000..130f8baca57 --- /dev/null +++ b/cuda/base/types.cpp @@ -0,0 +1,33 @@ +#include "cuda/base/types.hpp" + + +#if defined(__CUDACC__) + +#define BFLOAT_FRIEND_OPERATOR(_op, _opeq) \ + __device__ __forceinline__ __nv_bfloat16 operator _op( \ + const __nv_bfloat16& lhs, const __nv_bfloat16& rhs) \ + { \ + return static_cast<__nv_bfloat16>(static_cast(lhs) \ + _op static_cast(rhs)); \ + } \ + __device__ __forceinline__ __nv_bfloat16& operator _opeq( \ + __nv_bfloat16& lhs, const __nv_bfloat16& rhs) \ + { \ + lhs = static_cast(lhs) _op static_cast(rhs); \ + return lhs; \ + } +BFLOAT_FRIEND_OPERATOR(+, +=) +BFLOAT_FRIEND_OPERATOR(-, -=) +BFLOAT_FRIEND_OPERATOR(*, *=) +BFLOAT_FRIEND_OPERATOR(/, /=) + +__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16& h) +{ + return h; +} +__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16& h) +{ + return -float{h}; +} + +#endif \ No newline at end of file diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index fe4bc40c2de..c3e50d7e4a5 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include @@ -51,6 +52,39 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +// #if defined(__CUDACC__) + +// #define BFLOAT_FRIEND_OPERATOR(_op, _opeq) \ +// __forceinline__ __device__ __nv_bfloat16 operator _op( \ +// const __nv_bfloat16& lhs, const __nv_bfloat16& rhs) \ +// { \ +// return static_cast<__nv_bfloat16>(static_cast(lhs) \ +// _op static_cast(rhs)); \ +// } \ +// __forceinline__ __device__ __nv_bfloat16& operator _opeq( \ +// __nv_bfloat16& lhs, const __nv_bfloat16& rhs) \ +// { \ +// lhs = static_cast(lhs) _op static_cast(rhs); \ +// return lhs; \ +// } +// BFLOAT_FRIEND_OPERATOR(+, +=) +// BFLOAT_FRIEND_OPERATOR(-, -=) +// BFLOAT_FRIEND_OPERATOR(*, *=) +// BFLOAT_FRIEND_OPERATOR(/, /=) + +// __forceinline__ __device__ __nv_bfloat16 operator+(const __nv_bfloat16& h) +// { +// return h; +// } +// __forceinline__ __device__ __nv_bfloat16 operator-(const __nv_bfloat16& h) +// { +// return -float{h}; +// } +// #undef BFLOAT_FRIEND_OPERATOR + +// #endif + + // thrust calls the c function not the function from std // Maybe override the function from thrust directlry GKO_ATTRIBUTES GKO_INLINE __half hypot(__half a, __half b) @@ -64,6 +98,17 @@ GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> sqrt( return sqrt(static_cast>(a)); } +GKO_ATTRIBUTES GKO_INLINE __nv_bfloat16 hypot(__nv_bfloat16 a, __nv_bfloat16 b) +{ + return hypot(static_cast(a), static_cast(b)); +} + +GKO_ATTRIBUTES GKO_INLINE thrust::complex<__nv_bfloat16> sqrt( + thrust::complex<__nv_bfloat16> a) +{ + return sqrt(static_cast>(a)); +} + namespace thrust { @@ -75,6 +120,13 @@ GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) return abs(static_cast>(z)); } +template <> +GKO_ATTRIBUTES GKO_INLINE __nv_bfloat16 +abs<__nv_bfloat16>(const complex<__nv_bfloat16>& z) +{ + return abs(static_cast>(z)); +} + } // namespace thrust @@ -84,6 +136,12 @@ GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ { \ return thrust::complex{lhs} _op thrust::complex(rhs); \ + } \ + GKO_ATTRIBUTES GKO_INLINE thrust::complex<__nv_bfloat16> operator _op( \ + const thrust::complex<__nv_bfloat16>& lhs, \ + const thrust::complex<__nv_bfloat16>& rhs) \ + { \ + return thrust::complex{lhs} _op thrust::complex(rhs); \ } THRUST_HALF_FRIEND_OPERATOR(+, +=) @@ -108,6 +166,12 @@ __device__ __forceinline__ bool is_nan(const __half& val) return __hisnan(val); } +template <> +__device__ __forceinline__ bool is_nan(const __nv_bfloat16& val) +{ + return isnan(static_cast(val)); +} + #else @@ -118,6 +182,12 @@ __device__ __forceinline__ bool is_nan(const __half& val) return isnan(static_cast(val)); } +template <> +__device__ __forceinline__ bool is_nan(const __nv_bfloat16& val) +{ + return isnan(static_cast(val)); +} + #endif @@ -128,6 +198,13 @@ __device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val) return is_nan(val.real()) || is_nan(val.imag()); } +template <> +__device__ __forceinline__ bool is_nan( + const thrust::complex<__nv_bfloat16>& val) +{ + return is_nan(val.real()) || is_nan(val.imag()); +} + #endif @@ -141,16 +218,31 @@ namespace cuda { #if CUDA_VERSION >= 10020 __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } + +__device__ __forceinline__ __nv_bfloat16 abs(const __nv_bfloat16& val) +{ + return abs(static_cast(val)); +} #else __device__ __forceinline__ __half abs(const __half& val) { return abs(static_cast(val)); } + +__device__ __forceinline__ __nv_bfloat16 abs(const __nv_bfloat16& val) +{ + return abs(static_cast(val)); +} #endif __device__ 
__forceinline__ __half sqrt(const __half& val) { return hsqrt(val); } +__device__ __forceinline__ __nv_bfloat16 sqrt(const __nv_bfloat16& val) +{ + return sqrt(static_cast(val)); +} + #else @@ -160,12 +252,22 @@ __device__ __forceinline__ __half abs(const __half& val) return abs(static_cast(val)); } +__device__ __forceinline__ __nv_bfloat16 abs(const __nv_bfloat16& val) +{ + return abs(static_cast(val)); +} + __device__ __forceinline__ __half sqrt(const __half& val) { return sqrt(static_cast(val)); } +__device__ __forceinline__ __nv_bfloat16 sqrt(const __nv_bfloat16& val) +{ + return sqrt(static_cast(val)); +} + #endif #endif @@ -274,11 +376,21 @@ struct culibs_type_impl { using type = __half; }; +template <> +struct culibs_type_impl { + using type = __nv_bfloat16; +}; + template <> struct culibs_type_impl> { using type = __half2; }; +template <> +struct culibs_type_impl> { + using type = __nv_bfloat162; +}; + template struct culibs_type_impl> { using type = typename culibs_type_impl>::type; @@ -314,6 +426,11 @@ struct cuda_type_impl { using type = __half; }; +template <> +struct cuda_type_impl { + using type = __nv_bfloat16; +}; + template struct cuda_type_impl> { using type = thrust::complex::type>; @@ -334,6 +451,11 @@ struct cuda_type_impl<__half2> { using type = thrust::complex<__half>; }; +template <> +struct cuda_type_impl<__nv_bfloat162> { + using type = thrust::complex<__nv_bfloat16>; +}; + template struct cuda_struct_member_type_impl { using type = T; @@ -349,6 +471,11 @@ struct cuda_struct_member_type_impl { using type = __half; }; +template <> +struct cuda_struct_member_type_impl { + using type = __nv_bfloat16; +}; + template struct cuda_type_impl> { using type = matrix_data_entry< @@ -366,11 +493,13 @@ struct cuda_data_type_impl {}; } GKO_CUDA_DATA_TYPE(float16, CUDA_R_16F); +GKO_CUDA_DATA_TYPE(bfloat16, CUDA_R_16BF); GKO_CUDA_DATA_TYPE(float, CUDA_R_32F); GKO_CUDA_DATA_TYPE(double, CUDA_R_64F); GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_32F); GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_64F); GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_16F); +GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_16BF); GKO_CUDA_DATA_TYPE(int32, CUDA_R_32I); GKO_CUDA_DATA_TYPE(int8, CUDA_R_8I); diff --git a/cuda/factorization/cholesky_kernels.cu b/cuda/factorization/cholesky_kernels.cu index 30fd249530b..076ed9a2546 100644 --- a/cuda/factorization/cholesky_kernels.cu +++ b/cuda/factorization/cholesky_kernels.cu @@ -56,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/thrust.cuh" +#include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/reduction.cuh" diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index dd1ee930ca7..f44eb81ee9b 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -84,6 +84,7 @@ class MultiVector #if GINKGO_ENABLE_HALF public ConvertibleTo< MultiVector>>>, + public ConvertibleTo>>, #endif public ConvertibleTo>> { friend class EnableCreateMethod; @@ -129,6 +130,15 @@ class MultiVector void move_to(MultiVector>>* result) override; + + friend class MultiVector>; + using ConvertibleTo>>::convert_to; + using ConvertibleTo>>::move_to; + + void convert_to( + MultiVector>* result) const override; + + void move_to(MultiVector>* result) override; #endif /** diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index c75cfd89dcf..4d6858dbab1 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -44,7 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef __CUDA_ARCH__ - #include #include class hip_bfloat16; @@ -74,6 +73,8 @@ namespace gko { template class truncated; +class bfloat16; + namespace detail { @@ -371,6 +372,8 @@ class half { GKO_ATTRIBUTES half(const half& val) = default; + inline GKO_ATTRIBUTES half(const bfloat16& val); + template GKO_ATTRIBUTES half& operator=(const V val) { @@ -537,6 +540,9 @@ class bfloat16 { GKO_ATTRIBUTES bfloat16(const bfloat16& val) = default; + GKO_ATTRIBUTES bfloat16(const half& val) : bfloat16(static_cast(val)) + {} + template GKO_ATTRIBUTES bfloat16& operator=(const V val) { @@ -680,11 +686,18 @@ class bfloat16 { }; +inline GKO_ATTRIBUTES half::half(const bfloat16& val) + : half(static_cast(val)) +{} + + } // namespace gko namespace std { +template <> +class complex; template <> class complex { @@ -709,6 +722,11 @@ class complex { imag_(static_cast(0.f)) {} + complex(const gko::bfloat16& real) + : real_(static_cast(real)), + imag_(static_cast(0.f)) + {} + // When using complex(real, imag), MSVC with CUDA try to recognize the // complex is a member not constructor. template ::value>> @@ -717,6 +735,8 @@ class complex { imag_(static_cast(other.imag())) {} + explicit inline complex(const complex& other); + // explicit complex(const complex& other) = default; value_type real() const noexcept { return real_; } @@ -863,6 +883,11 @@ class complex { imag_(static_cast(0.f)) {} + complex(const gko::half& real) + : real_(static_cast(real)), + imag_(static_cast(0.f)) + {} + // When using complex(real, imag), MSVC with CUDA try to recognize the // complex is a member not constructor. 
template ::value>> @@ -871,6 +896,11 @@ class complex { imag_(static_cast(other.imag())) {} + explicit complex(const complex& other) + : real_(static_cast(other.real())), + imag_(static_cast(other.imag())) + {} + // explicit complex(const complex& other) = default; value_type real() const noexcept { return real_; } @@ -994,6 +1024,12 @@ class complex { }; +inline complex::complex(const complex& other) + : real_(static_cast(other.real())), + imag_(static_cast(other.imag())) +{} + + template <> struct numeric_limits { static constexpr bool is_specialized{true}; diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 193fb473b1f..b713ede64db 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -49,6 +49,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. class __half; +class __nv_bfloat16; +class hip_bfloat16; namespace thrust { @@ -80,6 +82,30 @@ inline std::complex sqrt(std::complex a) } +inline gko::bfloat16 abs(gko::bfloat16 a) +{ + return gko::bfloat16((a > 0) ? a : -a); +} + +inline gko::bfloat16 abs(std::complex a) +{ + // Using float abs not sqrt on norm to avoid overflow + return gko::bfloat16(abs(std::complex(a))); +} + + +inline gko::bfloat16 sqrt(gko::bfloat16 a) +{ + return gko::bfloat16(sqrt(float(a))); +} + +inline std::complex sqrt(std::complex a) +{ + return std::complex(sqrt(std::complex( + static_cast(a.real()), static_cast(a.imag())))); +} + + } // namespace std @@ -226,6 +252,15 @@ struct is_complex_or_scalar_impl : std::true_type {}; template <> struct is_complex_or_scalar_impl<__half> : std::true_type {}; +template <> +struct is_complex_or_scalar_impl : std::true_type {}; + +template <> +struct is_complex_or_scalar_impl<__nv_bfloat16> : std::true_type {}; + +template <> +struct is_complex_or_scalar_impl : std::true_type {}; + template struct is_complex_or_scalar_impl> : is_complex_or_scalar_impl {}; @@ -443,6 +478,12 @@ struct next_precision_impl {}; #if GINKGO_ENABLE_HALF template <> struct next_precision_impl { + using type = bfloat16; +}; + + +template <> +struct next_precision_impl { using type = float; }; #endif @@ -468,6 +509,19 @@ struct next_precision_impl> { }; +template +struct next_precision_impl2 { + using type = + typename next_precision_impl2::type, + I - 1>::type; +}; + +template +struct next_precision_impl2 { + using type = T; +}; + + template struct reduce_precision_impl { using type = T; @@ -520,6 +574,11 @@ struct arth_type { using type = float; }; +template <> +struct arth_type { + using type = float; +}; + template struct arth_type> { using type = std::complex::type>; @@ -542,11 +601,87 @@ struct highest_precision_impl { using type = decltype(T1{} + T2{}); }; +template <> +struct highest_precision_impl { + using type = float; +}; + +template <> +struct highest_precision_impl { + using type = float; +}; + +template <> +struct highest_precision_impl<__nv_bfloat16, __half> { + using type = float; +}; + +template <> +struct highest_precision_impl<__half, __nv_bfloat16> { + using type = float; +}; + +template <> +struct highest_precision_impl { + using type = float; +}; + +template <> +struct highest_precision_impl<__half, hip_bfloat16> { + using type = float; +}; + + +template <> +struct highest_precision_impl { + using type = double; +}; + +template <> +struct highest_precision_impl<__half, double> { + using type = double; +}; + +template <> +struct highest_precision_impl { + using type = double; +}; + +template <> +struct highest_precision_impl<__nv_bfloat16, 
double> { + using type = double; +}; + +template <> +struct highest_precision_impl { + using type = float; +}; + +template <> +struct highest_precision_impl<__half, float> { + using type = float; +}; + +template <> +struct highest_precision_impl { + using type = float; +}; + +template <> +struct highest_precision_impl<__nv_bfloat16, float> { + using type = float; +}; + template struct highest_precision_impl, std::complex> { using type = std::complex::type>; }; +template +struct highest_precision_impl, thrust::complex> { + using type = thrust::complex::type>; +}; + template struct highest_precision_variadic { using type = typename highest_precision_impl< @@ -568,6 +703,9 @@ struct highest_precision_variadic { template using next_precision = typename detail::next_precision_impl::type; +template +using next_precision2 = typename detail::next_precision_impl2::type; + /** * Obtains the previous type in the singly-linked precision list. @@ -577,12 +715,29 @@ using next_precision = typename detail::next_precision_impl::type; */ #if GINKGO_ENABLE_HALF template -using previous_precision = next_precision>; +using previous_precision = next_precision>>; #else template using previous_precision = next_precision; #endif +namespace detail { +template +struct previous_precision_impl2 { + using type = + typename previous_precision_impl2, I - 1>::type; +}; + +template +struct previous_precision_impl2 { + using type = T; +}; +} // namespace detail + +template +using previous_precision2 = + typename detail::previous_precision_impl2::type; + /** * Obtains the next type in the hierarchy with lower precision than T. diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 54157663879..5d9d68d19ee 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -123,6 +123,8 @@ GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE); // TODO: it only works on the transferring GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT); GKO_REGISTER_MPI_TYPE(std::complex, MPI_FLOAT); +GKO_REGISTER_MPI_TYPE(bfloat16, MPI_UNSIGNED_SHORT); +GKO_REGISTER_MPI_TYPE(std::complex, MPI_FLOAT); #endif // GKO_ENABLE_HALF GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_FLOAT_COMPLEX); GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_DOUBLE_COMPLEX); diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index c51e52846aa..f7da7f28075 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -450,12 +450,14 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ GKO_ADAPT_HF(_macro(half)); \ + GKO_ADAPT_HF(_macro(bfloat16)); \ template _macro(float); \ template <> \ _macro(double) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ GKO_ADAPT_HF(_macro(half)); \ + GKO_ADAPT_HF(_macro(bfloat16)); \ template _macro(float); \ template _macro(double) #endif @@ -477,6 +479,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ GKO_ADAPT_HF(_macro(std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex)); \ template _macro(std::complex); \ template <> \ _macro(std::complex) GKO_NOT_IMPLEMENTED @@ -484,6 +487,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ 
GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ GKO_ADAPT_HF(_macro(std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex)); \ template _macro(std::complex); \ template _macro(std::complex) #endif @@ -507,27 +511,33 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ GKO_ADAPT_HF(_macro(half, half)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16)); \ template _macro(float, float); \ template <> \ _macro(double, double) GKO_NOT_IMPLEMENTED; \ GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ template _macro(std::complex, std::complex); \ template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ GKO_ADAPT_HF(_macro(std::complex, half)); \ + GKO_ADAPT_HF(_macro(std::complex, bfloat16)); \ template _macro(std::complex, float); \ template <> \ _macro(std::complex, double) GKO_NOT_IMPLEMENTED; #else -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - GKO_ADAPT_HF(_macro(half, half)); \ - template _macro(float, float); \ - template _macro(double, double); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - GKO_ADAPT_HF(_macro(std::complex, half)); \ - template _macro(std::complex, float); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + GKO_ADAPT_HF(_macro(half, half)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex); \ + GKO_ADAPT_HF(_macro(std::complex, half)); \ + GKO_ADAPT_HF(_macro(std::complex, bfloat16)); \ + template _macro(std::complex, float); \ template _macro(std::complex, double) #endif @@ -557,19 +567,23 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ GKO_ADAPT_HF(_macro(half, int32)); \ + GKO_ADAPT_HF(_macro(bfloat16, int32)); \ template _macro(float, int32); \ template <> \ _macro(double, int32) GKO_NOT_IMPLEMENTED; \ GKO_ADAPT_HF(_macro(half, int64)); \ + GKO_ADAPT_HF(_macro(bfloat16, int64)); \ template _macro(float, int64); \ template <> \ _macro(double, int64) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ GKO_ADAPT_HF(_macro(half, int32)); \ + GKO_ADAPT_HF(_macro(bfloat16, int32)); \ template _macro(float, int32); \ template _macro(double, int32); \ GKO_ADAPT_HF(_macro(half, int64)); \ + GKO_ADAPT_HF(_macro(bfloat16, int64)); \ template _macro(float, int64); \ template _macro(double, int64) #endif @@ -587,10 +601,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ GKO_ADAPT_HF(_macro(std::complex, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template <> \ _macro(std::complex, int32) GKO_NOT_IMPLEMENTED; \ GKO_ADAPT_HF(_macro(std::complex, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64)); \ template _macro(std::complex, int64); \ template <> \ _macro(std::complex, int64) GKO_NOT_IMPLEMENTED @@ -598,9 
+614,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ GKO_ADAPT_HF(_macro(std::complex, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ GKO_ADAPT_HF(_macro(std::complex, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64)); \ template _macro(std::complex, int64); \ template _macro(std::complex, int64) #endif @@ -621,6 +639,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, GKO_ADAPT_HF(_macro(half, int32, int32)); \ GKO_ADAPT_HF(_macro(half, int32, int64)); \ GKO_ADAPT_HF(_macro(half, int64, int64)); \ + GKO_ADAPT_HF(_macro(bfloat16, int32, int32)); \ + GKO_ADAPT_HF(_macro(bfloat16, int32, int64)); \ + GKO_ADAPT_HF(_macro(bfloat16, int64, int64)); \ template _macro(float, int32, int32); \ template _macro(float, int32, int64); \ template _macro(float, int64, int64); \ @@ -636,6 +657,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, GKO_ADAPT_HF(_macro(half, int32, int32)); \ GKO_ADAPT_HF(_macro(half, int32, int64)); \ GKO_ADAPT_HF(_macro(half, int64, int64)); \ + GKO_ADAPT_HF(_macro(bfloat16, int32, int32)); \ + GKO_ADAPT_HF(_macro(bfloat16, int32, int64)); \ + GKO_ADAPT_HF(_macro(bfloat16, int64, int64)); \ template _macro(float, int32, int32); \ template _macro(float, int32, int64); \ template _macro(float, int64, int64); \ @@ -660,6 +684,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, GKO_ADAPT_HF(_macro(std::complex, int32, int32)); \ GKO_ADAPT_HF(_macro(std::complex, int32, int64)); \ GKO_ADAPT_HF(_macro(std::complex, int64, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64, int64)); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -676,6 +703,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, GKO_ADAPT_HF(_macro(std::complex, int32, int32)); \ GKO_ADAPT_HF(_macro(std::complex, int32, int64)); \ GKO_ADAPT_HF(_macro(std::complex, int64, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64, int64)); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -686,38 +716,52 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template <> \ - _macro(float, double) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(double, float) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(half, double) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(double, half) GKO_NOT_IMPLEMENTED; \ - GKO_ADAPT_HF(_macro(float, half)); \ - GKO_ADAPT_HF(_macro(half, float)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - template <> \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ + template <> \ + 
_macro(float, double) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(double, float) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(half, double) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(double, half) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(bfloat16, double) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(double, bfloat16) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(float, half)); \ + GKO_ADAPT_HF(_macro(half, float)); \ + GKO_ADAPT_HF(_macro(float, bfloat16)); \ + GKO_ADAPT_HF(_macro(bfloat16, float)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template <> \ + _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template <> \ + _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ - GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ - GKO_ADAPT_HF(_macro(half, half)); \ - template _macro(float, float); \ - template <> \ - _macro(double, double) GKO_NOT_IMPLEMENTED; \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ - template <> \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ + GKO_ADAPT_HF(_macro(half, half)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16)); \ + template _macro(float, float); \ + template <> \ + _macro(double, double) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ + template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED #else /** @@ -729,18 +773,30 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. 
*/ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template _macro(float, double); \ - template _macro(double, float); \ - GKO_ADAPT_HF(_macro(half, double)); \ - GKO_ADAPT_HF(_macro(double, half)); \ - GKO_ADAPT_HF(_macro(float, half)); \ - GKO_ADAPT_HF(_macro(half, float)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ + template _macro(float, double); \ + template _macro(double, float); \ + GKO_ADAPT_HF(_macro(half, double)); \ + GKO_ADAPT_HF(_macro(double, half)); \ + GKO_ADAPT_HF(_macro(float, half)); \ + GKO_ADAPT_HF(_macro(half, float)); \ + GKO_ADAPT_HF(_macro(bfloat16, double)); \ + GKO_ADAPT_HF(_macro(double, bfloat16)); \ + GKO_ADAPT_HF(_macro(float, bfloat16)); \ + GKO_ADAPT_HF(_macro(bfloat16, float)); \ + GKO_ADAPT_HF(_macro(bfloat16, half)); \ + GKO_ADAPT_HF(_macro(half, bfloat16)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -753,13 +809,15 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ - GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ - GKO_ADAPT_HF(_macro(half, half)); \ - template _macro(float, float); \ - template _macro(double, double); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ + GKO_ADAPT_HF(_macro(half, half)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) #endif @@ -772,15 +830,18 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. 
*/ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ - GKO_ADAPT_HF(_macro(half, half)); \ - template _macro(float, float); \ - template _macro(double, double); \ - GKO_ADAPT_HF(_macro(std::complex, half)); \ - template _macro(std::complex, float); \ - template _macro(std::complex, double); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ + GKO_ADAPT_HF(_macro(half, half)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(_macro(std::complex, half)); \ + GKO_ADAPT_HF(_macro(std::complex, bfloat16)); \ + template _macro(std::complex, float); \ + template _macro(std::complex, double); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -793,18 +854,20 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. */ -#define GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE(_macro) \ - template _macro(char, char); \ - template _macro(int32, int32); \ - template _macro(int64, int64); \ - template _macro(unsigned int, unsigned int); \ - template _macro(unsigned long, unsigned long); \ - GKO_ADAPT_HF(_macro(half, half)); \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(long double, long double); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE(_macro) \ + template _macro(char, char); \ + template _macro(int32, int32); \ + template _macro(int64, int64); \ + template _macro(unsigned int, unsigned int); \ + template _macro(unsigned long, unsigned long); \ + GKO_ADAPT_HF(_macro(half, half)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16)); \ + template _macro(float, float); \ + template _macro(double, double); \ + template _macro(long double, long double); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) /** @@ -817,9 +880,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #define GKO_INSTANTIATE_FOR_EACH_POD_TYPE(_macro) \ GKO_ADAPT_HF(_macro(half)); \ + GKO_ADAPT_HF(_macro(bfloat16)); \ template _macro(float); \ template _macro(double); \ GKO_ADAPT_HF(_macro(std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex)); \ template _macro(std::complex); \ template _macro(std::complex); \ template _macro(size_type); \ diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index ba81c959660..2a351c31321 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -271,6 +271,8 @@ class Matrix #if GINKGO_ENABLE_HALF public ConvertibleTo>, LocalIndexType, GlobalIndexType>>, + public ConvertibleTo, LocalIndexType, + GlobalIndexType>>, #endif public DistributedBase { friend class EnableCreateMethod; @@ -315,6 +317,18 @@ class Matrix void move_to(Matrix>, local_index_type, global_index_type>* result) override; + friend class Matrix, LocalIndexType, + GlobalIndexType>; + using ConvertibleTo, local_index_type, + global_index_type>>::convert_to; + 
using ConvertibleTo, local_index_type, + global_index_type>>::move_to; + + void convert_to(Matrix, local_index_type, + global_index_type>* result) const override; + + void move_to(Matrix, local_index_type, + global_index_type>* result) override; #endif /** * Reads a square matrix from the device_matrix_data structure and a global diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index b36bcd6444e..bd7778dc51c 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -90,6 +90,7 @@ class Vector public ConvertibleTo>>, #if GINKGO_ENABLE_HALF public ConvertibleTo>>>, + public ConvertibleTo>>, #endif public EnableAbsoluteComputation>>, public DistributedBase { @@ -208,6 +209,15 @@ class Vector void move_to( Vector>>* result) override; + + friend class Vector>; + using ConvertibleTo>>::convert_to; + using ConvertibleTo>>::move_to; + + void convert_to( + Vector>* result) const override; + + void move_to(Vector>* result) override; #endif std::unique_ptr compute_absolute() const override; @@ -684,6 +694,16 @@ struct conversion_target_helper> { return target_type::create(source->get_executor(), source->get_communicator()); } + + using trd_source_type = + experimental::distributed::Vector>; + + static std::unique_ptr create_empty( + const trd_source_type* source) + { + return target_type::create(source->get_executor(), + source->get_communicator()); + } #endif }; diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index b3435a22648..217e50980d1 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -81,6 +81,7 @@ class Coo : public EnableLinOp>, #if GINKGO_ENABLE_HALF public ConvertibleTo< Coo>, IndexType>>, + public ConvertibleTo, IndexType>>, #endif public ConvertibleTo>, public ConvertibleTo>, @@ -134,6 +135,17 @@ class Coo : public EnableLinOp>, void move_to(Coo>, IndexType>* result) override; + + friend class Coo, IndexType>; + using ConvertibleTo< + Coo, IndexType>>::convert_to; + using ConvertibleTo, IndexType>>::move_to; + + void convert_to( + Coo, IndexType>* result) const override; + + void move_to( + Coo, IndexType>* result) override; #endif void convert_to(Csr* other) const override; diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index d95b438b09a..28ebc31f05e 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -130,6 +130,7 @@ class Csr : public EnableLinOp>, #if GINKGO_ENABLE_HALF public ConvertibleTo< Csr>, IndexType>>, + public ConvertibleTo, IndexType>>, #endif public ConvertibleTo>, public ConvertibleTo>, @@ -740,6 +741,18 @@ class Csr : public EnableLinOp>, void move_to(Csr>, IndexType>* result) override; + + + friend class Csr, IndexType>; + using ConvertibleTo< + Csr, IndexType>>::convert_to; + using ConvertibleTo, IndexType>>::move_to; + + void convert_to( + Csr, IndexType>* result) const override; + + void move_to( + Csr, IndexType>* result) override; #endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 317f5cc5668..374b00b9eb5 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -111,6 +111,7 @@ class Dense public ConvertibleTo>>, #if GINKGO_ENABLE_HALF public ConvertibleTo>>>, + public ConvertibleTo>>, #endif public ConvertibleTo>, public ConvertibleTo>, @@ -318,6 +319,15 @@ class Dense void 
move_to( Dense>>* result) override; + + friend class Dense>; + using ConvertibleTo>>::convert_to; + using ConvertibleTo>>::move_to; + + void convert_to( + Dense>* result) const override; + + void move_to(Dense>* result) override; #endif void convert_to(Coo* result) const override; diff --git a/include/ginkgo/core/matrix/diagonal.hpp b/include/ginkgo/core/matrix/diagonal.hpp index d7ff95aa9f1..de34978b309 100644 --- a/include/ginkgo/core/matrix/diagonal.hpp +++ b/include/ginkgo/core/matrix/diagonal.hpp @@ -73,6 +73,7 @@ class Diagonal public ConvertibleTo>>, #if GINKGO_ENABLE_HALF public ConvertibleTo>>>, + public ConvertibleTo>>, #endif public Transposable, public WritableToMatrixData, @@ -126,6 +127,15 @@ class Diagonal void move_to( Diagonal>>* result) override; + + friend class Diagonal>; + using ConvertibleTo>>::convert_to; + using ConvertibleTo>>::move_to; + + void convert_to( + Diagonal>* result) const override; + + void move_to(Diagonal>* result) override; #endif void convert_to(Csr* result) const override; diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp index b696f8418e6..5e948b3440c 100644 --- a/include/ginkgo/core/matrix/ell.hpp +++ b/include/ginkgo/core/matrix/ell.hpp @@ -83,6 +83,7 @@ class Ell : public EnableLinOp>, #if GINKGO_ENABLE_HALF public ConvertibleTo< Ell>, IndexType>>, + public ConvertibleTo, IndexType>>, #endif public ConvertibleTo>, public ConvertibleTo>, @@ -135,6 +136,17 @@ class Ell : public EnableLinOp>, void move_to(Ell>, IndexType>* result) override; + + friend class Ell, IndexType>; + using ConvertibleTo< + Ell, IndexType>>::convert_to; + using ConvertibleTo, IndexType>>::move_to; + + void convert_to( + Ell, IndexType>* result) const override; + + void move_to( + Ell, IndexType>* result) override; #endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp index 05aa87833f5..e9d019cec5f 100644 --- a/include/ginkgo/core/matrix/fbcsr.hpp +++ b/include/ginkgo/core/matrix/fbcsr.hpp @@ -124,22 +124,24 @@ inline IndexType get_num_blocks(const int block_size, const IndexType size) * @ingroup LinOp */ template -class Fbcsr : public EnableLinOp>, - public EnableCreateMethod>, - public ConvertibleTo, IndexType>>, +class Fbcsr + : public EnableLinOp>, + public EnableCreateMethod>, + public ConvertibleTo, IndexType>>, #if GINKGO_ENABLE_HALF - public ConvertibleTo< - Fbcsr>, IndexType>>, + public ConvertibleTo< + Fbcsr>, IndexType>>, + public ConvertibleTo, IndexType>>, #endif - public ConvertibleTo>, - public ConvertibleTo>, - public ConvertibleTo>, - public DiagonalExtractable, - public ReadableFromMatrixData, - public WritableToMatrixData, - public Transposable, - public EnableAbsoluteComputation< - remove_complex>> { + public ConvertibleTo>, + public ConvertibleTo>, + public ConvertibleTo>, + public DiagonalExtractable, + public ReadableFromMatrixData, + public WritableToMatrixData, + public Transposable, + public EnableAbsoluteComputation< + remove_complex>> { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class Csr; @@ -199,6 +201,18 @@ class Fbcsr : public EnableLinOp>, void move_to(Fbcsr>, IndexType>* result) override; + + friend class Fbcsr, IndexType>; + using ConvertibleTo< + Fbcsr, IndexType>>::convert_to; + using ConvertibleTo< + Fbcsr, IndexType>>::move_to; + + void convert_to( + Fbcsr, IndexType>* result) const override; + + void move_to( + Fbcsr, IndexType>* result) override; #endif void convert_to(Dense* 
other) const override; diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index cfa72d9a693..6c17004d3a5 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -75,6 +75,7 @@ class Hybrid #if GINKGO_ENABLE_HALF public ConvertibleTo< Hybrid>, IndexType>>, + public ConvertibleTo, IndexType>>, #endif public ConvertibleTo>, public ConvertibleTo>, @@ -410,6 +411,18 @@ class Hybrid void move_to(Hybrid>, IndexType>* result) override; + + friend class Hybrid, IndexType>; + using ConvertibleTo< + Hybrid, IndexType>>::convert_to; + using ConvertibleTo< + Hybrid, IndexType>>::move_to; + + void convert_to(Hybrid, IndexType>* result) + const override; + + void move_to( + Hybrid, IndexType>* result) override; #endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp index 030301fe830..2e7f93cff4b 100644 --- a/include/ginkgo/core/matrix/sellp.hpp +++ b/include/ginkgo/core/matrix/sellp.hpp @@ -68,20 +68,22 @@ class Csr; * @ingroup LinOp */ template -class Sellp : public EnableLinOp>, - public EnableCreateMethod>, - public ConvertibleTo, IndexType>>, +class Sellp + : public EnableLinOp>, + public EnableCreateMethod>, + public ConvertibleTo, IndexType>>, #if GINKGO_ENABLE_HALF - public ConvertibleTo< - Sellp>, IndexType>>, + public ConvertibleTo< + Sellp>, IndexType>>, + public ConvertibleTo, IndexType>>, #endif - public ConvertibleTo>, - public ConvertibleTo>, - public DiagonalExtractable, - public ReadableFromMatrixData, - public WritableToMatrixData, - public EnableAbsoluteComputation< - remove_complex>> { + public ConvertibleTo>, + public ConvertibleTo>, + public DiagonalExtractable, + public ReadableFromMatrixData, + public WritableToMatrixData, + public EnableAbsoluteComputation< + remove_complex>> { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class Dense; @@ -126,6 +128,18 @@ class Sellp : public EnableLinOp>, void move_to(Sellp>, IndexType>* result) override; + + friend class Sellp, IndexType>; + using ConvertibleTo< + Sellp, IndexType>>::convert_to; + using ConvertibleTo< + Sellp, IndexType>>::move_to; + + void convert_to( + Sellp, IndexType>* result) const override; + + void move_to( + Sellp, IndexType>* result) override; #endif void convert_to(Dense* other) const override; diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp index 8d5e1749974..ceb5bda6661 100644 --- a/omp/components/atomic.hpp +++ b/omp/components/atomic.hpp @@ -105,6 +105,26 @@ void atomic_add(half& out, half val) } +template <> +void atomic_add(bfloat16& out, bfloat16 val) +{ + // UB? + uint16_t* address_as_converter = reinterpret_cast(&out); + uint16_t old = *address_as_converter; + uint16_t assumed; + do { + assumed = old; + auto answer = + reinterpret(reinterpret(assumed) + val); +#pragma omp atomic capture + { + old = *address_as_converter; + *address_as_converter = (old == assumed) ? answer : old; + } + } while (assumed != old); +} + + } // namespace omp } // namespace kernels } // namespace gko From d87dc17494adde51cad9df5ad9534a2930e61fad Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Mon, 11 Sep 2023 16:59:43 +0200 Subject: [PATCH 39/48] some trick for cuda arch < 8.0 --- common/cuda_hip/components/warp_blas.hpp.inc | 4 +-- common/cuda_hip/matrix/ell_kernels.hpp.inc | 2 +- .../unified/components/fill_array_kernels.cpp | 7 ++-- .../unified/solver/common_gmres_kernels.cpp | 6 ++-- include/ginkgo/core/base/half.hpp | 33 +++++++++++++++++++ 5 files changed, 44 insertions(+), 8 deletions(-) diff --git a/common/cuda_hip/components/warp_blas.hpp.inc b/common/cuda_hip/components/warp_blas.hpp.inc index 195f65ffd68..40e1af7a500 100644 --- a/common/cuda_hip/components/warp_blas.hpp.inc +++ b/common/cuda_hip/components/warp_blas.hpp.inc @@ -69,7 +69,7 @@ __device__ __forceinline__ void apply_gauss_jordan_transform( if (group.thread_rank() == key_row) { key_col_elem = one() / key_col_elem; } else { - key_col_elem = -row[key_col] / key_col_elem; + key_col_elem = zero() - row[key_col] / key_col_elem; } #pragma unroll for (int32 i = 0; i < max_problem_size; ++i) { @@ -115,7 +115,7 @@ __device__ __forceinline__ void apply_gauss_jordan_transform_with_rhs( key_col_elem = one() / key_col_elem; rhs[0] = key_rhs_elem * key_col_elem; } else { - key_col_elem = -row[key_col] / key_col_elem; + key_col_elem = zero() - row[key_col] / key_col_elem; rhs[0] += key_rhs_elem * key_col_elem; } #pragma unroll diff --git a/common/cuda_hip/matrix/ell_kernels.hpp.inc b/common/cuda_hip/matrix/ell_kernels.hpp.inc index 6c81fb4964c..8e8d6e89ae3 100644 --- a/common/cuda_hip/matrix/ell_kernels.hpp.inc +++ b/common/cuda_hip/matrix/ell_kernels.hpp.inc @@ -72,7 +72,7 @@ __device__ void spmv_kernel( OutputValueType, default_block_size / num_thread_per_worker> storage; if (idx_in_worker == 0) { - storage[threadIdx.x] = 0; + storage[threadIdx.x] = gko::zero(); } __syncthreads(); auto temp = zero(); diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp index 04167661a4d..97dc9faeb62 100644 --- a/common/unified/components/fill_array_kernels.cpp +++ b/common/unified/components/fill_array_kernels.cpp @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
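// A reading of the "trick for cuda arch < 8.0" in this commit (an
// interpretation, not spelled out in the commit message): __nv_bfloat16 only
// ships complete device-side operator support from compute capability 8.0,
// so the hunks above avoid constructs that are fragile on older devices,
// namely unary minus on a templated value type and assignment from an
// integer literal, by routing them through the generic helpers, e.g.
//
//     key_col_elem = zero<ValueType>() - row[key_col] / key_col_elem;
//     storage[threadIdx.x] = gko::zero<OutputValueType>();
//
// (both lines taken from the hunks above; the template arguments are
// reconstructed here and should be read as assumptions). This keeps the
// kernels valid for every value type, including bfloat16, without relying on
// native bfloat16 operators on the device.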
#include "common/unified/base/kernel_launch.hpp" - namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { @@ -64,7 +63,11 @@ void fill_seq_array(std::shared_ptr exec, run_kernel( exec, [] GKO_KERNEL(auto idx, auto array) { - array[idx] = static_cast(idx); + array[idx] = static_cast, __nv_bfloat16>::value || + std::is_same, + thrust::complex<__nv_bfloat16>>::value, + float, long long>::type>(idx); }, n, array); } diff --git a/common/unified/solver/common_gmres_kernels.cpp b/common/unified/solver/common_gmres_kernels.cpp index 94646cc477f..7c00df081f9 100644 --- a/common/unified/solver/common_gmres_kernels.cpp +++ b/common/unified/solver/common_gmres_kernels.cpp @@ -117,7 +117,7 @@ void hessenberg_qr(std::shared_ptr exec, const auto gc = givens_cos(j, rhs); const auto gs = givens_sin(j, rhs); const auto out1 = gc * hess_this + gs * hess_next; - const auto out2 = -conj(gs) * hess_this + conj(gc) * hess_next; + const auto out2 = conj(gc) * hess_next - conj(gs) * hess_this; hessenberg_iter(j, rhs) = out1; hessenberg_iter(j + 1, rhs) = hess_this = out2; hess_next = hessenberg_iter(j + 2, rhs); @@ -143,8 +143,8 @@ void hessenberg_qr(std::shared_ptr exec, hessenberg_iter(iter, rhs) = gc * hess_this + gs * hess_next; hessenberg_iter(iter + 1, rhs) = zero(); // apply new Givens rotation to RHS of least-squares problem - const auto rnc_new = - -conj(gs) * residual_norm_collection(iter, rhs); + const auto rnc_new = zero() - + conj(gs) * residual_norm_collection(iter, rhs); residual_norm_collection(iter + 1, rhs) = rnc_new; residual_norm_collection(iter, rhs) = gc * residual_norm_collection(iter, rhs); diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 4d6858dbab1..c5b0f09a8ad 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -46,6 +46,39 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include + +#if defined(__CUDACC__) + +#define BFLOAT_FRIEND_OPERATOR(_op, _opeq) \ + __forceinline__ __device__ __nv_bfloat16 operator _op( \ + const __nv_bfloat16& lhs, const __nv_bfloat16& rhs) \ + { \ + return static_cast<__nv_bfloat16>(static_cast(lhs) \ + _op static_cast(rhs)); \ + } \ + __forceinline__ __device__ __nv_bfloat16& operator _opeq( \ + __nv_bfloat16& lhs, const __nv_bfloat16& rhs) \ + { \ + lhs = static_cast(lhs) _op static_cast(rhs); \ + return lhs; \ + } +BFLOAT_FRIEND_OPERATOR(+, +=) +BFLOAT_FRIEND_OPERATOR(-, -=) +BFLOAT_FRIEND_OPERATOR(*, *=) +BFLOAT_FRIEND_OPERATOR(/, /=) + +__forceinline__ __device__ __nv_bfloat16 operator+(const __nv_bfloat16& h) +{ + return h; +} +__forceinline__ __device__ __nv_bfloat16 operator-(const __nv_bfloat16& h) +{ + return -float{h}; +} +#undef BFLOAT_FRIEND_OPERATOR + +#endif + class hip_bfloat16; From 5f486818ef1a013999201d5255396d1ccf78da67 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Tue, 12 Sep 2023 11:00:48 +0200 Subject: [PATCH 40/48] fix the missing type conversion --- core/preconditioner/jacobi.cpp | 13 +++++++------ include/ginkgo/core/base/precision_dispatch.hpp | 13 +++++++++++-- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index 75f5e941303..717385fdb9a 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -317,12 +317,13 @@ void Jacobi::generate(const LinOp* system_matrix, if (parameters_.max_block_size == 1) { auto diag = share(as(system_matrix) ->extract_diagonal_linop()); - auto diag_vt = - ::gko::detail::temporary_conversion>:: - template create>, - matrix::Diagonal>>>( - diag.get()); + auto diag_vt = ::gko::detail:: + temporary_conversion>::template create< + matrix::Diagonal>, + matrix::Diagonal< + previous_precision>>, + matrix::Diagonal>>( + diag.get()); if (!diag_vt) { GKO_NOT_SUPPORTED(system_matrix); } diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index 5a1e0ab9175..a1080eecb12 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -79,10 +79,12 @@ make_temporary_conversion(Ptr&& matrix) using NextDense = matrix::Dense>; using NextNextDense = matrix::Dense>>; + using NextNextNextDense = matrix::Dense>; using MaybeConstDense = std::conditional_t::value, const Dense, Dense>; - auto result = detail::temporary_conversion< - MaybeConstDense>::template create(matrix); + auto result = + detail::temporary_conversion::template create< + NextDense, NextNextDense, NextNextNextDense>(matrix); if (!result) { GKO_NOT_SUPPORTED(matrix); } @@ -258,6 +260,7 @@ void mixed_precision_dispatch(Function fn, const LinOp* in, LinOp* out) using fst_type = matrix::Dense; using snd_type = matrix::Dense>; using trd_type = matrix::Dense>>; + using fth_type = matrix::Dense>; if (auto dense_in = dynamic_cast(in)) { if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); @@ -265,6 +268,8 @@ void mixed_precision_dispatch(Function fn, const LinOp* in, LinOp* out) fn(dense_in, dense_out); } else if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); } else { GKO_NOT_SUPPORTED(out); } @@ -275,6 +280,8 @@ void mixed_precision_dispatch(Function fn, const LinOp* in, LinOp* out) fn(dense_in, dense_out); } else if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); } else { GKO_NOT_SUPPORTED(out); } @@ -285,6 +292,8 @@ void mixed_precision_dispatch(Function fn, const LinOp* in, LinOp* out) fn(dense_in, dense_out); } else if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); } else { GKO_NOT_SUPPORTED(out); } From 316775fca8bddc4316d5e213a0f4dfb8eee687d1 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Mon, 2 Oct 2023 01:33:03 +0200 Subject: [PATCH 41/48] enable bfloat16 test and fix/skip --- core/test/utils.hpp | 27 ++++++++++++------- core/test/utils/array_generator_test.cpp | 6 +++-- core/test/utils/matrix_utils_test.cpp | 12 ++++----- core/test/utils/value_generator_test.cpp | 4 +-- .../ginkgo/core/base/precision_dispatch.hpp | 8 ++++-- reference/test/solver/bicgstab_kernels.cpp | 2 ++ reference/test/solver/cgs_kernels.cpp | 4 +++ test/components/reduce_array_kernels.cpp | 7 ++++- test/matrix/fbcsr_kernels.cpp | 4 +++ test/mpi/matrix.cpp | 4 +-- 10 files changed, 53 insertions(+), 25 deletions(-) diff --git a/core/test/utils.hpp b/core/test/utils.hpp index 554568ed81a..1982aae20a5 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -70,10 +70,10 @@ namespace test { using ValueTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types) std::complex>; #else - ::testing::Types) std::complex, std::complex>; #endif @@ -102,9 +102,9 @@ using ComplexValueTypesNoHalf = using RealValueTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types; + ::testing::Types; #else - ::testing::Types; + ::testing::Types; #endif @@ -126,11 +126,11 @@ using PODTypes = using ValueAndIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types) std::complex, gko::int32, gko::int64, gko::size_type>; #else - ::testing::Types) std::complex, std::complex, gko::int32, gko::int64, gko::size_type>; @@ -139,11 +139,11 @@ using ValueAndIndexTypes = using RealValueAndIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types; + ::testing::Types; #else - ::testing::Types; + ::testing::Types; #endif @@ -396,5 +396,12 @@ using next_precision = typename detail::next_precision_impl::type; "This assert is used to counter the false positive extra " \ "semi-colon warnings") +#define SKIP_IF_BFLOAT16(type) \ + if (std::is_same, gko::bfloat16>::value) { \ + GTEST_SKIP() << "Skip due to bfloat16 mode"; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") #endif // GKO_CORE_TEST_UTILS_HPP_ diff --git a/core/test/utils/array_generator_test.cpp b/core/test/utils/array_generator_test.cpp index 018652f88a4..e38fa72aaed 100644 --- a/core/test/utils/array_generator_test.cpp +++ b/core/test/utils/array_generator_test.cpp @@ -65,8 +65,10 @@ class ArrayGenerator : public ::testing::Test { InputIterator sample_end, Closure closure_op) { using std::pow; - ValueType res = 0; - ValueType num_elems = 0; + // use double to avoid rounding error + double res = 0; + // can not use ValueType when it is bfloat16 + int num_elems = 0; while (sample_start != sample_end) { auto tmp = *(sample_start++); res += pow(closure_op(tmp) - c, n); diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp index 1640d1310c6..c4a2e26af6b 100644 --- a/core/test/utils/matrix_utils_test.cpp +++ b/core/test/utils/matrix_utils_test.cpp @@ -258,9 +258,9 @@ TYPED_TEST(MatrixUtils, MakeHpdMatrixCorrectly) using T = typename TestFixture::value_type; auto cpy_data = this->data; - gko::utils::make_hpd(this->data, 1.001); + gko::utils::make_hpd(this->data, 1.01); gko::utils::make_hermitian(cpy_data); - gko::utils::make_diag_dominant(cpy_data, 1.001); + gko::utils::make_diag_dominant(cpy_data, 1.01); auto mtx = TestFixture::mtx_type::create(this->exec); mtx->read(this->data); @@ -273,7 +273,7 @@ TYPED_TEST(MatrixUtils, MakeHpdMatrixCorrectly) TYPED_TEST(MatrixUtils, MakeHpdMatrixWithRatioCorrectly) { using T = typename TestFixture::value_type; - 
gko::remove_complex ratio = 1.002; + gko::remove_complex ratio = 1.02; auto cpy_data = this->data; gko::utils::make_hpd(this->data, ratio); @@ -293,9 +293,9 @@ TYPED_TEST(MatrixUtils, MakeSpdMatrixCorrectly) using T = typename TestFixture::value_type; auto cpy_data = this->data; - gko::utils::make_spd(this->data, 1.001); + gko::utils::make_spd(this->data, 1.01); gko::utils::make_symmetric(cpy_data); - gko::utils::make_diag_dominant(cpy_data, 1.001); + gko::utils::make_diag_dominant(cpy_data, 1.01); auto mtx = TestFixture::mtx_type::create(this->exec); mtx->read(this->data); @@ -308,7 +308,7 @@ TYPED_TEST(MatrixUtils, MakeSpdMatrixCorrectly) TYPED_TEST(MatrixUtils, MakeSpdMatrixWithRatioCorrectly) { using T = typename TestFixture::value_type; - gko::remove_complex ratio = 1.002; + gko::remove_complex ratio = 1.02; auto cpy_data = this->data; gko::utils::make_spd(this->data, ratio); diff --git a/core/test/utils/value_generator_test.cpp b/core/test/utils/value_generator_test.cpp index c65cab1cce3..61cd4d7f809 100644 --- a/core/test/utils/value_generator_test.cpp +++ b/core/test/utils/value_generator_test.cpp @@ -59,8 +59,8 @@ class ValueGenerator : public ::testing::Test { InputIterator sample_end, Closure closure_op) { using std::pow; - ValueType res = 0; - ValueType num_elems = 0; + double res = 0; + int num_elems = 0; while (sample_start != sample_end) { auto tmp = *(sample_start++); res += pow(closure_op(tmp) - c, n); diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index a1080eecb12..e63c05fec30 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -393,7 +393,9 @@ make_temporary_conversion(LinOp* matrix) template create< experimental::distributed::Vector>, experimental::distributed::Vector< - next_precision>>>(matrix); + next_precision>>, + experimental::distributed::Vector>>( + matrix); if (!result) { GKO_NOT_SUPPORTED(matrix); } @@ -413,7 +415,9 @@ make_temporary_conversion(const LinOp* matrix) template create< experimental::distributed::Vector>, experimental::distributed::Vector< - next_precision>>>(matrix); + next_precision>>, + experimental::distributed::Vector>>( + matrix); if (!result) { GKO_NOT_SUPPORTED(matrix); } diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp index 9b0ea1f6c80..bdc7ce58516 100644 --- a/reference/test/solver/bicgstab_kernels.cpp +++ b/reference/test/solver/bicgstab_kernels.cpp @@ -625,6 +625,7 @@ TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck1) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; SKIP_IF_HALF(value_type); + SKIP_IF_BFLOAT16(value_type); auto half_tol = std::sqrt(r::value); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, @@ -654,6 +655,7 @@ TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck2) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; SKIP_IF_HALF(value_type); + SKIP_IF_BFLOAT16(value_type); auto half_tol = std::sqrt(r::value); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp index 74bf08d08e8..4a3f5001a1d 100644 --- a/reference/test/solver/cgs_kernels.cpp +++ b/reference/test/solver/cgs_kernels.cpp @@ -320,6 +320,7 @@ TYPED_TEST(Cgs, SolvesDenseSystem) { using Mtx = typename TestFixture::Mtx; using 
value_type = typename TestFixture::value_type; + SKIP_IF_BFLOAT16(value_type); auto solver = this->cgs_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); @@ -334,6 +335,7 @@ TYPED_TEST(Cgs, SolvesDenseSystemMixed) { using value_type = next_precision; using Mtx = gko::matrix::Dense; + SKIP_IF_BFLOAT16(typename TestFixture::value_type); auto solver = this->cgs_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); @@ -393,6 +395,7 @@ TYPED_TEST(Cgs, SolvesMultipleDenseSystem) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using T = value_type; + SKIP_IF_BFLOAT16(value_type); auto half_tol = std::sqrt(r::value); auto solver = this->cgs_factory->generate(this->mtx); auto b = gko::initialize( @@ -498,6 +501,7 @@ TYPED_TEST(Cgs, SolvesMultipleDenseSystemsUsingAdvancedApply) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using T = value_type; + SKIP_IF_BFLOAT16(value_type); auto half_tol = std::sqrt(r::value); auto solver = this->cgs_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp index 90f9d532f7a..490b0e90fdb 100644 --- a/test/components/reduce_array_kernels.cpp +++ b/test/components/reduce_array_kernels.cpp @@ -52,8 +52,13 @@ template class ReduceArray : public CommonTestFixture { protected: using value_type = T; + // In bfloat16, 256 + 1 -> 256. The reference gets 256 but parallel version + // doesn't due to ordering and grouping. ReduceArray() - : total_size(1024), + : total_size( + (std::is_same, gko::bfloat16>::value) + ? 
254 + : 1024), out{ref, I{2}}, dout{exec, out}, vals{ref, total_size}, diff --git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp index 6e0fbe555dc..3571a9db5fb 100644 --- a/test/matrix/fbcsr_kernels.cpp +++ b/test/matrix/fbcsr_kernels.cpp @@ -160,6 +160,7 @@ TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted) using value_type = typename Mtx::value_type; if (this->exec->get_master() != this->exec) { SKIP_IF_HALF(value_type); + SKIP_IF_BFLOAT16(value_type); } auto drand = gko::clone(this->exec, this->rsorted); auto x = @@ -185,6 +186,7 @@ TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted) using value_type = typename Mtx::value_type; if (this->exec->get_master() != this->exec) { SKIP_IF_HALF(value_type); + SKIP_IF_BFLOAT16(value_type); } auto drand = gko::clone(this->exec, this->rsorted); auto x = @@ -211,6 +213,7 @@ TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted) using real_type = typename TestFixture::real_type; if (this->exec->get_master() != this->exec) { SKIP_IF_HALF(value_type); + SKIP_IF_BFLOAT16(value_type); } auto drand = gko::clone(this->exec, this->rsorted); auto x = @@ -244,6 +247,7 @@ TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted) using real_type = typename TestFixture::real_type; if (this->exec->get_master() != this->exec) { SKIP_IF_HALF(value_type); + SKIP_IF_BFLOAT16(value_type); } auto drand = gko::clone(this->exec, this->rsorted); auto x = diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 0c1fc2c1a36..1814750f803 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -360,9 +360,9 @@ TYPED_TEST(Matrix, CanApplyToMultipleVectors) using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::global_index_type; auto vec_md = gko::matrix_data{ - I>{{1, 11}, {2, 22}, {3, 33}, {4, 44}, {5, 55}}}; + I>{{1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10}}}; I> result[3] = { - {{10, 110}, {18, 198}}, {{28, 308}, {67, 737}}, {{59, 649}}}; + {{10, 25}, {18, 53}}, {{28, 83}, {67, 142}}, {{59, 154}}}; auto rank = this->comm.rank(); this->x->read_distributed(vec_md, this->col_part); this->y->read_distributed(vec_md, this->row_part); From 4f0d12a09516c5ee2f865da2d9059dc4ddd0f4cf Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 2 Oct 2023 15:51:00 +0200 Subject: [PATCH 42/48] fix missing type, use bfloat16 op from cudaarch80 --- core/matrix/dense.cpp | 3 ++- include/ginkgo/core/base/half.hpp | 7 +++---- include/ginkgo/core/base/precision_dispatch.hpp | 12 ++++++++++++ 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 24f1d200375..bc37226bcd9 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -1393,7 +1393,8 @@ void gather_mixed_real_complex(Function fn, LinOp* out) using fst_type = matrix::Dense; using snd_type = matrix::Dense>; using trd_type = matrix::Dense>>; - run(out, fn); + using fth_type = matrix::Dense>; + run(out, fn); #else precision_dispatch(fn, out); #endif diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index c5b0f09a8ad..0c2bac4f27d 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -47,8 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
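// Worked example for the bfloat16 rounding behaviour noted in the
// reduce_array test above (a sketch, assuming round-to-nearest-even):
// bfloat16 keeps 8 significand bits, so representable values between 256 and
// 512 are spaced 2 apart. 256 + 1 = 257 lies exactly halfway between 256 and
// 258 and rounds back to the even neighbour, 256, so an accumulation that
// adds 1 at a time cannot advance past 256. That is why the test size is
// capped at 254 for bfloat16, while the parallel reduction, which groups
// partial sums differently, can still produce a different result.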
#include #include -#if defined(__CUDACC__) - +#if defined(__CUDACC__) && (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)) #define BFLOAT_FRIEND_OPERATOR(_op, _opeq) \ __forceinline__ __device__ __nv_bfloat16 operator _op( \ const __nv_bfloat16& lhs, const __nv_bfloat16& rhs) \ @@ -433,7 +432,7 @@ class half { return static_cast(static_cast(lhf) \ _op static_cast(rhf)); \ } \ - GKO_ATTRIBUTES half& operator _opeq(const half & hf) \ + GKO_ATTRIBUTES half& operator _opeq(const half& hf) \ { \ auto result = *this _op hf; \ this->float2half(result); \ @@ -599,7 +598,7 @@ class bfloat16 { return static_cast(static_cast(lhf) \ _op static_cast(rhf)); \ } \ - GKO_ATTRIBUTES bfloat16& operator _opeq(const bfloat16 & hf) \ + GKO_ATTRIBUTES bfloat16& operator _opeq(const bfloat16& hf) \ { \ auto result = *this _op hf; \ this->float2bfloat16(result); \ diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index e63c05fec30..27714266c40 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -297,6 +297,18 @@ void mixed_precision_dispatch(Function fn, const LinOp* in, LinOp* out) } else { GKO_NOT_SUPPORTED(out); } + } else if (auto dense_in = dynamic_cast(in)) { + if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else { + GKO_NOT_SUPPORTED(out); + } } else { GKO_NOT_SUPPORTED(in); } From 8364204b8bdf857525f032b13629f8825a5a5ee5 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 2 Oct 2023 21:34:37 +0200 Subject: [PATCH 43/48] fix hip hip does not support float -> bfloat16 implicit conversion bfloat a = float() does not work --- .../preconditioner/isai_kernels.hpp.inc | 5 +- .../unified/components/fill_array_kernels.cpp | 12 ++-- .../precision_conversion_kernels.cpp | 3 +- .../unified/matrix/dense_kernels.template.cpp | 3 +- hip/base/types.hip.hpp | 67 +++++++++++++++++++ hip/matrix/csr_kernels.instantiate.hip.cpp | 24 +++++++ hip/test/matrix/fbcsr_kernels.cpp | 12 ++-- include/ginkgo/core/base/math.hpp | 4 +- 8 files changed, 114 insertions(+), 16 deletions(-) diff --git a/common/cuda_hip/preconditioner/isai_kernels.hpp.inc b/common/cuda_hip/preconditioner/isai_kernels.hpp.inc index ce46925ef58..5ac21caf318 100644 --- a/common/cuda_hip/preconditioner/isai_kernels.hpp.inc +++ b/common/cuda_hip/preconditioner/isai_kernels.hpp.inc @@ -290,8 +290,9 @@ __global__ __launch_bounds__(default_block_size) void generate_general_inverse( ValueType sol = subwarp.shfl(rhs, perm); if (spd) { - auto diag = subwarp.shfl(sol, num_elems - 1); - sol /= sqrt(diag); + ValueType diag = subwarp.shfl(sol, num_elems - 1); + // TODO: check why HIP bfloat16 return float + sol /= static_cast(sqrt(diag)); } return sol; diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp index 97dc9faeb62..47be03f6951 100644 --- a/common/unified/components/fill_array_kernels.cpp +++ b/common/unified/components/fill_array_kernels.cpp @@ -63,11 +63,15 @@ void fill_seq_array(std::shared_ptr exec, run_kernel( exec, [] GKO_KERNEL(auto idx, auto array) { + // hip bfloat16 does not provide implicit conversion array[idx] = static_cast, __nv_bfloat16>::value || - std::is_same, - thrust::complex<__nv_bfloat16>>::value, - 
float, long long>::type>(idx); + std::is_same>, + __nv_bfloat16>::value, + float, + typename std::conditional< + std::is_same>, + hip_bfloat16>::value, + hip_bfloat16, long long>::type>::type>(idx); }, n, array); } diff --git a/common/unified/components/precision_conversion_kernels.cpp b/common/unified/components/precision_conversion_kernels.cpp index 222e9471f96..310189c64c0 100644 --- a/common/unified/components/precision_conversion_kernels.cpp +++ b/common/unified/components/precision_conversion_kernels.cpp @@ -56,8 +56,7 @@ void convert_precision(std::shared_ptr exec, typename std::remove_reference::type>::type; using out_type = typename std::remove_cv< typename std::remove_reference::type>::type; - out[idx] = - static_cast>(in[idx]); + out[idx] = static_cast(in[idx]); }, size, in, out); } diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp index 3a4ab98e5f6..5f5d28564c6 100644 --- a/common/unified/matrix/dense_kernels.template.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -62,8 +62,7 @@ void copy(std::shared_ptr exec, run_kernel( exec, [] GKO_KERNEL(auto row, auto col, auto input, auto output) { - using type = - device_type>; + using type = device_type; output(row, col) = static_cast(input(row, col)); }, input->get_size(), input, output); diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index aa792e4edb7..5f9943a4c5a 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -68,6 +69,18 @@ __device__ __forceinline__ thrust::complex<__half> sqrt( return sqrt(static_cast>(a)); } +__device__ __forceinline__ hip_bfloat16 hypot(hip_bfloat16 a, hip_bfloat16 b) +{ + return static_cast( + hypot(static_cast(a), static_cast(b))); +} + +__device__ __forceinline__ thrust::complex sqrt( + thrust::complex a) +{ + return sqrt(static_cast>(a)); +} + __device__ __forceinline__ thrust::complex sqrt( thrust::complex val) { @@ -84,8 +97,17 @@ __device__ __forceinline__ __half sqrt(__half val) { return sqrt(static_cast(val)); } + +__device__ __forceinline__ hip_bfloat16 sqrt(hip_bfloat16 val) +{ + return sqrt(static_cast(val)); +} #else __device__ __forceinline__ __half sqrt(__half val) { return hsqrt(val); } +__device__ __forceinline__ hip_bfloat16 sqrt(hip_bfloat16 val) +{ + return static_cast(sqrt(static_cast(val))); +} #endif @@ -99,6 +121,14 @@ GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) return hypot(static_cast(z.real()), static_cast(z.imag())); } +template <> +GKO_ATTRIBUTES GKO_INLINE hip_bfloat16 +abs(const complex& z) +{ + return static_cast( + hypot(static_cast(z.real()), static_cast(z.imag()))); +} + } // namespace thrust @@ -107,6 +137,12 @@ GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ { \ return thrust::complex{lhs} _op thrust::complex(rhs); \ + } \ + GKO_ATTRIBUTES GKO_INLINE thrust::complex operator _op( \ + const thrust::complex lhs, \ + const thrust::complex rhs) \ + { \ + return thrust::complex{lhs} _op thrust::complex(rhs); \ } THRUST_HALF_FRIEND_OPERATOR(+, +=) @@ -155,6 +191,15 @@ __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } #endif +__device__ __forceinline__ bool is_nan(const hip_bfloat16& val) +{ + return is_nan(static_cast(val)); +} + +__device__ __forceinline__ hip_bfloat16 abs(const 
hip_bfloat16& val) +{ + return static_cast(abs(static_cast(val))); +} namespace kernels { namespace hip { @@ -267,6 +312,16 @@ struct hiplibs_type_impl> { using type = __half2; }; +template <> +struct hiplibs_type_impl { + using type = hip_bfloat16; +}; + +template <> +struct hiplibs_type_impl> { + // TODO: HIP does not support it. + using type = __half2; +}; template struct hiplibs_type_impl> { @@ -345,6 +400,11 @@ struct hip_type_impl { using type = __half; }; +template <> +struct hip_type_impl { + using type = hip_bfloat16; +}; + template struct hip_type_impl> { using type = thrust::complex::type>; @@ -365,6 +425,8 @@ struct hip_type_impl<__half2> { using type = thrust::complex<__half>; }; +// TODO: hip does not support hip_bfloat162 + template struct hip_struct_member_type_impl { using type = T; @@ -380,6 +442,11 @@ struct hip_struct_member_type_impl { using type = __half; }; +template <> +struct hip_struct_member_type_impl { + using type = hip_bfloat16; +}; + template struct hip_type_impl> { using type = diff --git a/hip/matrix/csr_kernels.instantiate.hip.cpp b/hip/matrix/csr_kernels.instantiate.hip.cpp index c6c5fe4afe3..a0cb622eeca 100644 --- a/hip/matrix/csr_kernels.instantiate.hip.cpp +++ b/hip/matrix/csr_kernels.instantiate.hip.cpp @@ -68,6 +68,12 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(GKO_DECLARE_CSR_SPMV_KERNEL, GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(GKO_DECLARE_CSR_SPMV_KERNEL, int32); // split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT7(GKO_DECLARE_CSR_SPMV_KERNEL, + int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT8(GKO_DECLARE_CSR_SPMV_KERNEL, + int32); +// split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(GKO_DECLARE_CSR_SPMV_KERNEL, int64); // split @@ -85,6 +91,12 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(GKO_DECLARE_CSR_SPMV_KERNEL, // split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(GKO_DECLARE_CSR_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT7(GKO_DECLARE_CSR_SPMV_KERNEL, + int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT8(GKO_DECLARE_CSR_SPMV_KERNEL, + int64); // split @@ -106,6 +118,12 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5( GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); // split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT7( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT8( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); +// split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); // split @@ -123,6 +141,12 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5( // split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT7( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT8( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); // split diff --git a/hip/test/matrix/fbcsr_kernels.cpp b/hip/test/matrix/fbcsr_kernels.cpp index e8c87957c73..a8611beddea 100644 --- a/hip/test/matrix/fbcsr_kernels.cpp +++ b/hip/test/matrix/fbcsr_kernels.cpp @@ -178,7 +178,8 @@ TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted) this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 1)); auto prod_hip = Dense::create(this->exec, prod_ref->get_size()); - if (std::is_same::value) { + if (std::is_same::value || + std::is_same::value) { 
ASSERT_THROW(rand_hip->apply(x_hip, prod_hip), gko::NotImplemented); } else { rand_hip->apply(x_hip, prod_hip); @@ -206,7 +207,8 @@ TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted) this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 3)); auto prod_hip = Dense::create(this->exec, prod_ref->get_size()); - if (std::is_same::value) { + if (std::is_same::value || + std::is_same::value) { ASSERT_THROW(rand_hip->apply(x_hip, prod_hip), gko::NotImplemented); } else { rand_hip->apply(x_hip, prod_hip); @@ -246,7 +248,8 @@ TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted) auto beta = Dense::create(this->exec); beta->copy_from(beta_ref); - if (std::is_same::value) { + if (std::is_same::value || + std::is_same::value) { ASSERT_THROW(rand_hip->apply(alpha, x_hip, beta, prod_hip), gko::NotImplemented); } else { @@ -287,7 +290,8 @@ TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted) auto beta = Dense::create(this->exec); beta->copy_from(beta_ref); - if (std::is_same::value) { + if (std::is_same::value || + std::is_same::value) { ASSERT_THROW(rand_hip->apply(alpha, x_hip, beta, prod_hip), gko::NotImplemented); } else { diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index b713ede64db..09fdef5bea4 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -922,7 +922,7 @@ GKO_INLINE __host__ constexpr T zero(const T&) template GKO_INLINE __host__ constexpr T one() { - return T(1); + return T(static_cast>(1.0)); } @@ -982,7 +982,7 @@ GKO_INLINE __device__ constexpr std::enable_if_t< !std::is_same>>::value, T> one() { - return T(1); + return T(static_cast>(1.0)); } From 95ac5ea6d868213bebd54ff157849a324c59eae8 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 2 Oct 2023 21:35:23 +0200 Subject: [PATCH 44/48] fix dpcpp --- dpcpp/matrix/csr_kernels.dp.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 863cd7e2520..2ae9022edf9 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -1248,7 +1248,9 @@ bool try_general_sparselib_spmv(std::shared_ptr exec, matrix::Dense* c) { constexpr bool try_sparselib = - !is_complex() && !std::is_same::value; + !is_complex() && + !std::is_same::value && + !std::is_same::value; if constexpr (try_sparselib) { oneapi::mkl::sparse::matrix_handle_t mat_handle; oneapi::mkl::sparse::init_matrix_handle(&mat_handle); From 653cd3654ba797ff33dc241dd99a005f732d7b03 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Mon, 2 Oct 2023 21:35:32 +0200 Subject: [PATCH 45/48] fix nvhpc --- include/ginkgo/core/base/half.hpp | 2 +- omp/components/atomic.hpp | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 0c2bac4f27d..f6d9fcc320a 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -562,7 +562,7 @@ class half { class bfloat16 { public: - GKO_ATTRIBUTES bfloat16() noexcept = default; + GKO_ATTRIBUTES bfloat16() noexcept : data_(0) {} template ::value>> GKO_ATTRIBUTES bfloat16(const T val) diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp index ceb5bda6661..90c754907d1 100644 --- a/omp/components/atomic.hpp +++ b/omp/components/atomic.hpp @@ -108,6 +108,14 @@ void atomic_add(half& out, half val) template <> void atomic_add(bfloat16& out, bfloat16 val) { +#ifdef __NVCOMPILER +// NVC++ uses atomic capture on uint16 leads the following error. +// use of undefined value '%L.B*' br label %L.B* !llvm.loop !*, !dbg !* +#pragma omp critical + { + out += val; + } +#else // UB? uint16_t* address_as_converter = reinterpret_cast(&out); uint16_t old = *address_as_converter; @@ -122,6 +130,7 @@ void atomic_add(bfloat16& out, bfloat16 val) *address_as_converter = (old == assumed) ? answer : old; } } while (assumed != old); +#endif } From 50cf5b23f121c94ee8eb6f6e5393a793f273eb5b Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 3 Oct 2023 20:50:49 +0200 Subject: [PATCH 46/48] add the casting --- accessor/reduced_row_major_reference.hpp | 5 +++-- common/unified/matrix/dense_kernels.template.cpp | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/accessor/reduced_row_major_reference.hpp b/accessor/reduced_row_major_reference.hpp index 10960316eb2..59122160835 100644 --- a/accessor/reduced_row_major_reference.hpp +++ b/accessor/reduced_row_major_reference.hpp @@ -103,7 +103,7 @@ class reduced_storage operator=(arithmetic_type val) && { storage_type* const GKO_ACC_RESTRICT r_ptr = ptr_; - *r_ptr = val; + *r_ptr = detail::implicit_explicit_conversion(val); return val; } @@ -115,7 +115,8 @@ class reduced_storage } constexpr GKO_ACC_ATTRIBUTES arithmetic_type - operator=(reduced_storage&& ref) && noexcept + operator=(reduced_storage&& ref) && + noexcept { std::move(*this) = ref.implicit_conversion(); return *this; diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp index 5f5d28564c6..d6cda937fdf 100644 --- a/common/unified/matrix/dense_kernels.template.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -405,7 +405,8 @@ void row_gather(std::shared_ptr exec, run_kernel( exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto rows, auto gathered) { - gathered(row, col) = orig(rows[row], col); + gathered(row, col) = + static_cast>(orig(rows[row], col)); }, dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, orig, *row_idxs, row_collection); @@ -425,10 +426,10 @@ void advanced_row_gather(std::shared_ptr exec, [] GKO_KERNEL(auto row, auto col, auto alpha, auto orig, auto rows, auto beta, auto gathered) { using type = device_type>; - gathered(row, col) = + gathered(row, col) = static_cast>( static_cast(alpha[0] * orig(rows[row], col)) + static_cast(beta[0]) * - static_cast(gathered(row, col)); + static_cast(gathered(row, col))); }, dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, alpha->get_const_values(), orig, *row_idxs, beta->get_const_values(), From 
51ab0b0187f1232114e1332fe91e0fda76f167fd Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 4 Oct 2023 11:41:59 +0200 Subject: [PATCH 47/48] use float as the bridge between bfloat16 and half --- .../components/precision_conversion_kernels.cpp | 11 ++++++----- common/unified/matrix/dense_kernels.template.cpp | 12 +++++++++--- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/common/unified/components/precision_conversion_kernels.cpp b/common/unified/components/precision_conversion_kernels.cpp index 310189c64c0..df1cd9fa062 100644 --- a/common/unified/components/precision_conversion_kernels.cpp +++ b/common/unified/components/precision_conversion_kernels.cpp @@ -52,11 +52,12 @@ void convert_precision(std::shared_ptr exec, run_kernel( exec, [] GKO_KERNEL(auto idx, auto in, auto out) { - using in_type = typename std::remove_cv< - typename std::remove_reference::type>::type; - using out_type = typename std::remove_cv< - typename std::remove_reference::type>::type; - out[idx] = static_cast(in[idx]); + using target_type = device_type; + using arithmetic_type = + highest_precision>; + // use float as the bridge between bfloat16 and half on device + out[idx] = + static_cast(static_cast(in[idx])); }, size, in, out); } diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp index d6cda937fdf..81d7543d79c 100644 --- a/common/unified/matrix/dense_kernels.template.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -63,7 +63,10 @@ void copy(std::shared_ptr exec, exec, [] GKO_KERNEL(auto row, auto col, auto input, auto output) { using type = device_type; - output(row, col) = static_cast(input(row, col)); + using arithmetic_type = + highest_precision>; + output(row, col) = static_cast( + static_cast(input(row, col))); }, input->get_size(), input, output); } @@ -405,8 +408,11 @@ void row_gather(std::shared_ptr exec, run_kernel( exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto rows, auto gathered) { - gathered(row, col) = - static_cast>(orig(rows[row], col)); + using output_type = device_type; + using arithmetic_type = + highest_precision>; + gathered(row, col) = static_cast( + static_cast(orig(rows[row], col))); }, dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, orig, *row_idxs, row_collection); From b531ba31b96dc65a5f606a211bcebd8f5e3a2456 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 4 Oct 2023 13:06:56 +0200 Subject: [PATCH 48/48] fix ell accessor type --- common/cuda_hip/matrix/ell_kernels.hpp.inc | 24 +++++++------ cuda/matrix/ell_kernels.cu | 6 ++-- dpcpp/matrix/ell_kernels.dp.cpp | 40 +++++++++++++--------- hip/matrix/ell_kernels.hip.cpp | 6 ++-- 4 files changed, 45 insertions(+), 31 deletions(-) diff --git a/common/cuda_hip/matrix/ell_kernels.hpp.inc b/common/cuda_hip/matrix/ell_kernels.hpp.inc index 8e8d6e89ae3..4c0a46f2193 100644 --- a/common/cuda_hip/matrix/ell_kernels.hpp.inc +++ b/common/cuda_hip/matrix/ell_kernels.hpp.inc @@ -43,13 +43,14 @@ __device__ void spmv_kernel( acc::range b, OutputValueType* __restrict__ c, const size_type c_stride, Closure op) { + using arithmetic_type = typename a_accessor::arithmetic_type; const auto tidx = thread::get_thread_id_flat(); const decltype(tidx) column_id = blockIdx.y; if (num_thread_per_worker == 1) { // Specialize the num_thread_per_worker = 1. 
It doesn't need the shared // memory, __syncthreads, and atomic_add if (tidx < num_rows) { - auto temp = zero(); + auto temp = zero(); for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { const auto ind = tidx + idx * stride; const auto col_idx = col[ind]; @@ -69,13 +70,13 @@ __device__ void spmv_kernel( const auto worker_id = tidx / num_rows; const auto step_size = num_worker_per_row * num_thread_per_worker; __shared__ uninitialized_array< - OutputValueType, default_block_size / num_thread_per_worker> + arithmetic_type, default_block_size / num_thread_per_worker> storage; if (idx_in_worker == 0) { storage[threadIdx.x] = gko::zero(); } __syncthreads(); - auto temp = zero(); + auto temp = zero(); for (size_type idx = worker_id * num_thread_per_worker + idx_in_worker; idx < num_stored_elements_per_row; idx += step_size) { @@ -114,7 +115,9 @@ __global__ __launch_bounds__(default_block_size) void spmv( spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [](const OutputValueType& x, const OutputValueType& y) { return x; }); + [](const auto& x, const OutputValueType& y) { + return static_cast(x); + }); } @@ -128,7 +131,8 @@ __global__ __launch_bounds__(default_block_size) void spmv( const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, const size_type c_stride) { - const OutputValueType alpha_val = alpha(0); + using arithmetic_type = typename a_accessor::arithmetic_type; + const auto alpha_val = alpha(0); const OutputValueType beta_val = beta[0]; if (atomic) { // Because the atomic operation changes the values of c during @@ -139,16 +143,16 @@ __global__ __launch_bounds__(default_block_size) void spmv( spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [&alpha_val](const OutputValueType& x, const OutputValueType& y) { - return alpha_val * x; + [&alpha_val](const auto& x, const OutputValueType& y) { + return static_cast(alpha_val * x); }); } else { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [&alpha_val, &beta_val](const OutputValueType& x, - const OutputValueType& y) { - return alpha_val * x + beta_val * y; + [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) { + return static_cast( + alpha_val * x + static_cast(beta_val * y)); }); } } diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 124a4deda75..7b20236827e 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -122,10 +122,12 @@ void abstract_spmv(syn::value_list, const matrix::Dense* alpha = nullptr, const matrix::Dense* beta = nullptr) { + using arithmetic_type = + highest_precision; using a_accessor = - gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + gko::acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; using b_accessor = - gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; const auto nrows = a->get_size()[0]; const auto stride = a->get_stride(); diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index 65fad771140..4817b9a5991 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -120,16 +120,17 @@ void spmv_kernel( const size_type stride, const size_type num_stored_elements_per_row, acc::range b, OutputValueType* __restrict__ c, const size_type c_stride, Closure op, 
sycl::nd_item<3> item_ct1, - uninitialized_array& storage) { + using arithmetic_type = typename a_accessor::arithmetic_type; const auto tidx = thread::get_thread_id_flat(item_ct1); const decltype(tidx) column_id = item_ct1.get_group(1); if (num_thread_per_worker == 1) { // Specialize the num_thread_per_worker = 1. It doesn't need the shared // memory, __syncthreads, and atomic_add if (tidx < num_rows) { - auto temp = zero(); + auto temp = zero(); for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { const auto ind = tidx + idx * stride; const auto col_idx = col[ind]; @@ -150,11 +151,11 @@ void spmv_kernel( const auto step_size = num_worker_per_row * num_thread_per_worker; if (runnable && idx_in_worker == 0) { - storage[item_ct1.get_local_id(2)] = 0; + storage[item_ct1.get_local_id(2)] = zero(); } item_ct1.barrier(sycl::access::fence_space::local_space); - auto temp = zero(); + auto temp = zero(); if (runnable) { for (size_type idx = worker_id * num_thread_per_worker + idx_in_worker; @@ -193,13 +194,15 @@ void spmv( const size_type stride, const size_type num_stored_elements_per_row, acc::range b, OutputValueType* __restrict__ c, const size_type c_stride, sycl::nd_item<3> item_ct1, - uninitialized_array& storage) { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [](const OutputValueType& x, const OutputValueType& y) { return x; }, + [](const auto& x, const OutputValueType& y) { + return static_cast(x); + }, item_ct1, storage); } @@ -214,7 +217,7 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, { queue->submit([&](sycl::handler& cgh) { sycl::accessor< - uninitialized_array, 0, sycl::access_mode::read_write, sycl::access::target::local> storage_acc_ct1(cgh); @@ -239,10 +242,11 @@ void spmv( const size_type num_stored_elements_per_row, acc::range b, const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, const size_type c_stride, sycl::nd_item<3> item_ct1, - uninitialized_array& storage) { - const OutputValueType alpha_val = alpha(0); + using arithmetic_type = typename a_accessor::arithmetic_type; + const auto alpha_val = alpha(0); const OutputValueType beta_val = beta[0]; if (atomic) { // Because the atomic operation changes the values of c during @@ -253,17 +257,17 @@ void spmv( spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [&alpha_val](const OutputValueType& x, const OutputValueType& y) { - return alpha_val * x; + [&alpha_val](const auto& x, const OutputValueType& y) { + return static_cast(alpha_val * x); }, item_ct1, storage); } else { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [&alpha_val, &beta_val](const OutputValueType& x, - const OutputValueType& y) { - return alpha_val * x + beta_val * y; + [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) { + return static_cast( + alpha_val * x + static_cast(beta_val * y)); }, item_ct1, storage); } @@ -281,7 +285,7 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, { queue->submit([&](sycl::handler& cgh) { sycl::accessor< - uninitialized_array, 0, sycl::access_mode::read_write, sycl::access::target::local> storage_acc_ct1(cgh); @@ -316,10 +320,12 @@ void abstract_spmv(syn::value_list, const matrix::Dense* alpha = nullptr, const matrix::Dense* beta = nullptr) { + using arithmetic_type = + highest_precision; using a_accessor = - gko::acc::reduced_row_major<1, OutputValueType, const 
MatrixValueType>; + gko::acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; using b_accessor = - gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; const auto nrows = a->get_size()[0]; const auto stride = a->get_stride(); diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index db9d5aa11bb..1567548463f 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -125,10 +125,12 @@ void abstract_spmv(syn::value_list, const matrix::Dense* alpha = nullptr, const matrix::Dense* beta = nullptr) { + using arithmetic_type = + highest_precision; using a_accessor = - acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; using b_accessor = - acc::reduced_row_major<2, OutputValueType, const InputValueType>; + acc::reduced_row_major<2, arithmetic_type, const InputValueType>; const auto nrows = a->get_size()[0]; const auto stride = a->get_stride();
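
Taken together, the last two patches stop doing arithmetic directly in the 16-bit storage types: precision conversions go through float as a bridge, and the ELL SpMV kernels read their operands through accessors whose arithmetic type is a highest_precision alias rather than OutputValueType, casting back to the output type only when a result is written. The host-only sketch below illustrates that accumulation pattern outside of Ginkgo; the function name, the raw-pointer interface, and the fixed choice of float as the arithmetic type are illustrative stand-ins for the accessor-based kernels above, not the library's API.

#include <cstddef>

// One row of an ELL SpMV with a wider accumulator. In the real kernels the
// promotion is expressed through the reduced_row_major accessors and a
// highest_precision alias; plain float stands in for that arithmetic type.
template <typename MatrixValueType, typename InputValueType,
          typename OutputValueType>
void ell_row_spmv_sketch(std::size_t row, std::size_t stride,
                         std::size_t num_stored_elements_per_row,
                         const MatrixValueType* val, const std::size_t* col,
                         const InputValueType* b, OutputValueType* c)
{
    using arithmetic_type = float;  // stand-in for highest_precision<...>
    auto temp = arithmetic_type{0};
    for (std::size_t idx = 0; idx < num_stored_elements_per_row; ++idx) {
        const auto ind = row + idx * stride;  // column-major ELL storage
        // promote both operands before multiplying, as the accessors do
        temp += static_cast<arithmetic_type>(val[ind]) *
                static_cast<arithmetic_type>(b[col[ind]]);
    }
    // a single narrowing cast when the result is stored, mirroring the
    // static_cast<OutputValueType>(...) in the op lambdas above
    c[row] = static_cast<OutputValueType>(temp);
}

With half or bfloat16 value types this keeps rounding to one cast per stored result instead of rounding every partial sum, which is the point of routing the arithmetic through the wider type.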